Coverage Report

Created: 2026-05-13 15:15

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/util/bit_packing.inline.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
// the implement of BitPacking is from impala
19
20
#include <boost/preprocessor/repetition/repeat_from_to.hpp>
21
22
#include "util/bit_packing.h"
23
24
namespace doris {
25
5.32M
inline int64_t BitPacking::NumValuesToUnpack(int bit_width, int64_t in_bytes, int64_t num_values) {
26
    // Check if we have enough input bytes to decode 'num_values'.
27
5.32M
    if (bit_width == 0 ||
28
5.32M
        BitUtil::RoundUpNumBytes((uint32_t)(num_values * bit_width)) <= in_bytes) {
29
        // Limited by output space.
30
5.31M
        return num_values;
31
5.31M
    } else {
32
        // Limited by the number of input bytes. Compute the number of values that can be
33
        // unpacked from the input.
34
10.9k
        return (in_bytes * CHAR_BIT) / bit_width;
35
10.9k
    }
36
5.32M
}
37
38
0
constexpr uint64_t GetMask(int num_bits) {
39
0
    if (num_bits >= 64) {
40
0
        return ~0L;
41
0
    }
42
0
    return (1ULL << num_bits) - 1;
43
0
}
44
45
template <typename T>
46
0
constexpr bool IsSupportedUnpackingType() {
47
0
    return std::is_same<T, uint8_t>::value || std::is_same<T, uint16_t>::value ||
48
0
           std::is_same<T, uint32_t>::value || std::is_same<T, uint64_t>::value;
49
0
}
Unexecuted instantiation: _ZN5doris24IsSupportedUnpackingTypeIjEEbv
Unexecuted instantiation: _ZN5doris24IsSupportedUnpackingTypeIhEEbv
50
51
template <typename OutType>
52
std::pair<const uint8_t*, int64_t> BitPacking::UnpackValues(int bit_width,
53
                                                            const uint8_t* __restrict__ in,
54
                                                            int64_t in_bytes, int64_t num_values,
55
5.32M
                                                            OutType* __restrict__ out) {
56
5.32M
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
57
58
5.32M
#pragma push_macro("UNPACK_VALUES_CASE")
59
5.32M
#define UNPACK_VALUES_CASE(ignore1, i, ignore2) \
60
5.35M
    case i:                                     \
61
5.35M
        return UnpackValues<OutType, i>(in, in_bytes, num_values, out);
62
63
5.32M
    switch (bit_width) {
64
        // Expand cases from 0 to 64.
65
5.35M
        BOOST_PP_REPEAT_FROM_TO(0, 65, UNPACK_VALUES_CASE, ignore);
66
0
    default:
67
0
        DCHECK(false);
68
0
        return std::make_pair(nullptr, -1);
69
5.32M
    }
70
5.32M
#pragma pop_macro("UNPACK_VALUES_CASE")
71
5.32M
}
_ZN5doris10BitPacking12UnpackValuesIjEESt4pairIPKhlEiS4_llPT_
Line
Count
Source
55
5.31M
                                                            OutType* __restrict__ out) {
56
5.31M
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
57
58
5.31M
#pragma push_macro("UNPACK_VALUES_CASE")
59
5.31M
#define UNPACK_VALUES_CASE(ignore1, i, ignore2) \
60
5.31M
    case i:                                     \
61
5.31M
        return UnpackValues<OutType, i>(in, in_bytes, num_values, out);
62
63
5.31M
    switch (bit_width) {
64
        // Expand cases from 0 to 64.
65
0
        BOOST_PP_REPEAT_FROM_TO(0, 65, UNPACK_VALUES_CASE, ignore);
66
0
    default:
67
        DCHECK(false);
68
0
        return std::make_pair(nullptr, -1);
69
5.31M
    }
70
5.31M
#pragma pop_macro("UNPACK_VALUES_CASE")
71
5.31M
}
_ZN5doris10BitPacking12UnpackValuesIhEESt4pairIPKhlEiS4_llPT_
Line
Count
Source
55
9.60k
                                                            OutType* __restrict__ out) {
56
9.60k
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
57
58
9.60k
#pragma push_macro("UNPACK_VALUES_CASE")
59
9.60k
#define UNPACK_VALUES_CASE(ignore1, i, ignore2) \
60
9.60k
    case i:                                     \
61
9.60k
        return UnpackValues<OutType, i>(in, in_bytes, num_values, out);
62
63
9.60k
    switch (bit_width) {
64
        // Expand cases from 0 to 64.
65
0
        BOOST_PP_REPEAT_FROM_TO(0, 65, UNPACK_VALUES_CASE, ignore);
66
0
    default:
67
        DCHECK(false);
68
0
        return std::make_pair(nullptr, -1);
69
9.60k
    }
70
9.60k
#pragma pop_macro("UNPACK_VALUES_CASE")
71
9.60k
}
72
73
template <typename OutType, int BIT_WIDTH>
74
std::pair<const uint8_t*, int64_t> BitPacking::UnpackValues(const uint8_t* __restrict__ in,
75
                                                            int64_t in_bytes, int64_t num_values,
76
5.35M
                                                            OutType* __restrict__ out) {
77
5.35M
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
78
79
5.35M
    constexpr int BATCH_SIZE = 32;
80
5.35M
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
81
5.35M
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
82
5.35M
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
83
5.35M
    const uint8_t* in_pos = in;
84
5.35M
    OutType* out_pos = out;
85
86
    // First unpack as many full batches as possible.
87
26.8M
    for (int64_t i = 0; i < batches_to_read; ++i) {
88
21.5M
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
89
21.5M
        out_pos += BATCH_SIZE;
90
21.5M
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
91
21.5M
    }
92
93
    // Then unpack the final partial batch.
94
5.35M
    if (remainder_values > 0) {
95
3.10M
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
96
3.10M
                                                        out_pos);
97
3.10M
    }
98
5.35M
    return std::make_pair(in_pos, values_to_read);
99
5.35M
}
_ZN5doris10BitPacking12UnpackValuesIjLi0EEESt4pairIPKhlES4_llPT_
Line
Count
Source
76
2.36k
                                                            OutType* __restrict__ out) {
77
2.36k
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
78
79
2.36k
    constexpr int BATCH_SIZE = 32;
80
2.36k
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
81
2.36k
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
82
2.36k
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
83
2.36k
    const uint8_t* in_pos = in;
84
2.36k
    OutType* out_pos = out;
85
86
    // First unpack as many full batches as possible.
87
2.36k
    for (int64_t i = 0; i < batches_to_read; ++i) {
88
0
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
89
0
        out_pos += BATCH_SIZE;
90
0
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
91
0
    }
92
93
    // Then unpack the final partial batch.
94
2.36k
    if (remainder_values > 0) {
95
2.36k
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
96
2.36k
                                                        out_pos);
97
2.36k
    }
98
2.36k
    return std::make_pair(in_pos, values_to_read);
99
2.36k
}
_ZN5doris10BitPacking12UnpackValuesIjLi1EEESt4pairIPKhlES4_llPT_
Line
Count
Source
76
1.29M
                                                            OutType* __restrict__ out) {
77
1.29M
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
78
79
1.29M
    constexpr int BATCH_SIZE = 32;
80
1.29M
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
81
1.29M
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
82
1.29M
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
83
1.29M
    const uint8_t* in_pos = in;
84
1.29M
    OutType* out_pos = out;
85
86
    // First unpack as many full batches as possible.
87
1.70M
    for (int64_t i = 0; i < batches_to_read; ++i) {
88
418k
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
89
418k
        out_pos += BATCH_SIZE;
90
418k
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
91
418k
    }
92
93
    // Then unpack the final partial batch.
94
1.29M
    if (remainder_values > 0) {
95
964k
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
96
964k
                                                        out_pos);
97
964k
    }
98
1.29M
    return std::make_pair(in_pos, values_to_read);
99
1.29M
}
_ZN5doris10BitPacking12UnpackValuesIjLi2EEESt4pairIPKhlES4_llPT_
Line
Count
Source
76
1.13M
                                                            OutType* __restrict__ out) {
77
1.13M
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
78
79
1.13M
    constexpr int BATCH_SIZE = 32;
80
1.13M
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
81
1.13M
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
82
1.13M
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
83
1.13M
    const uint8_t* in_pos = in;
84
1.13M
    OutType* out_pos = out;
85
86
    // First unpack as many full batches as possible.
87
2.84M
    for (int64_t i = 0; i < batches_to_read; ++i) {
88
1.70M
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
89
1.70M
        out_pos += BATCH_SIZE;
90
1.70M
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
91
1.70M
    }
92
93
    // Then unpack the final partial batch.
94
1.13M
    if (remainder_values > 0) {
95
647k
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
96
647k
                                                        out_pos);
97
647k
    }
98
1.13M
    return std::make_pair(in_pos, values_to_read);
99
1.13M
}
_ZN5doris10BitPacking12UnpackValuesIjLi3EEESt4pairIPKhlES4_llPT_
Line
Count
Source
76
210k
                                                            OutType* __restrict__ out) {
77
210k
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
78
79
210k
    constexpr int BATCH_SIZE = 32;
80
210k
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
81
210k
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
82
210k
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
83
210k
    const uint8_t* in_pos = in;
84
210k
    OutType* out_pos = out;
85
86
    // First unpack as many full batches as possible.
87
1.64M
    for (int64_t i = 0; i < batches_to_read; ++i) {
88
1.43M
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
89
1.43M
        out_pos += BATCH_SIZE;
90
1.43M
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
91
1.43M
    }
92
93
    // Then unpack the final partial batch.
94
210k
    if (remainder_values > 0) {
95
113k
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
96
113k
                                                        out_pos);
97
113k
    }
98
210k
    return std::make_pair(in_pos, values_to_read);
99
210k
}
_ZN5doris10BitPacking12UnpackValuesIjLi4EEESt4pairIPKhlES4_llPT_
Line
Count
Source
76
769k
                                                            OutType* __restrict__ out) {
77
769k
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
78
79
769k
    constexpr int BATCH_SIZE = 32;
80
769k
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
81
769k
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
82
769k
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
83
769k
    const uint8_t* in_pos = in;
84
769k
    OutType* out_pos = out;
85
86
    // First unpack as many full batches as possible.
87
5.76M
    for (int64_t i = 0; i < batches_to_read; ++i) {
88
4.99M
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
89
4.99M
        out_pos += BATCH_SIZE;
90
4.99M
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
91
4.99M
    }
92
93
    // Then unpack the final partial batch.
94
769k
    if (remainder_values > 0) {
95
401k
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
96
401k
                                                        out_pos);
97
401k
    }
98
769k
    return std::make_pair(in_pos, values_to_read);
99
769k
}
_ZN5doris10BitPacking12UnpackValuesIjLi5EEESt4pairIPKhlES4_llPT_
Line
Count
Source
76
2.94k
                                                            OutType* __restrict__ out) {
77
2.94k
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
78
79
2.94k
    constexpr int BATCH_SIZE = 32;
80
2.94k
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
81
2.94k
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
82
2.94k
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
83
2.94k
    const uint8_t* in_pos = in;
84
2.94k
    OutType* out_pos = out;
85
86
    // First unpack as many full batches as possible.
87
14.1k
    for (int64_t i = 0; i < batches_to_read; ++i) {
88
11.1k
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
89
11.1k
        out_pos += BATCH_SIZE;
90
11.1k
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
91
11.1k
    }
92
93
    // Then unpack the final partial batch.
94
2.94k
    if (remainder_values > 0) {
95
1.76k
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
96
1.76k
                                                        out_pos);
97
1.76k
    }
98
2.94k
    return std::make_pair(in_pos, values_to_read);
99
2.94k
}
_ZN5doris10BitPacking12UnpackValuesIjLi6EEESt4pairIPKhlES4_llPT_
Line
Count
Source
76
455k
                                                            OutType* __restrict__ out) {
77
455k
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
78
79
455k
    constexpr int BATCH_SIZE = 32;
80
455k
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
81
455k
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
82
455k
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
83
455k
    const uint8_t* in_pos = in;
84
455k
    OutType* out_pos = out;
85
86
    // First unpack as many full batches as possible.
87
3.72M
    for (int64_t i = 0; i < batches_to_read; ++i) {
88
3.26M
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
89
3.26M
        out_pos += BATCH_SIZE;
90
3.26M
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
91
3.26M
    }
92
93
    // Then unpack the final partial batch.
94
455k
    if (remainder_values > 0) {
95
222k
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
96
222k
                                                        out_pos);
97
222k
    }
98
455k
    return std::make_pair(in_pos, values_to_read);
99
455k
}
_ZN5doris10BitPacking12UnpackValuesIjLi7EEESt4pairIPKhlES4_llPT_
Line
Count
Source
76
13.6k
                                                            OutType* __restrict__ out) {
77
13.6k
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
78
79
13.6k
    constexpr int BATCH_SIZE = 32;
80
13.6k
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
81
13.6k
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
82
13.6k
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
83
13.6k
    const uint8_t* in_pos = in;
84
13.6k
    OutType* out_pos = out;
85
86
    // First unpack as many full batches as possible.
87
50.9k
    for (int64_t i = 0; i < batches_to_read; ++i) {
88
37.3k
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
89
37.3k
        out_pos += BATCH_SIZE;
90
37.3k
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
91
37.3k
    }
92
93
    // Then unpack the final partial batch.
94
13.6k
    if (remainder_values > 0) {
95
7.25k
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
96
7.25k
                                                        out_pos);
97
7.25k
    }
98
13.6k
    return std::make_pair(in_pos, values_to_read);
99
13.6k
}
_ZN5doris10BitPacking12UnpackValuesIjLi8EEESt4pairIPKhlES4_llPT_
Line
Count
Source
76
9.49k
                                                            OutType* __restrict__ out) {
77
9.49k
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
78
79
9.49k
    constexpr int BATCH_SIZE = 32;
80
9.49k
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
81
9.49k
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
82
9.49k
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
83
9.49k
    const uint8_t* in_pos = in;
84
9.49k
    OutType* out_pos = out;
85
86
    // First unpack as many full batches as possible.
87
29.9k
    for (int64_t i = 0; i < batches_to_read; ++i) {
88
20.4k
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
89
20.4k
        out_pos += BATCH_SIZE;
90
20.4k
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
91
20.4k
    }
92
93
    // Then unpack the final partial batch.
94
9.49k
    if (remainder_values > 0) {
95
7.80k
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
96
7.80k
                                                        out_pos);
97
7.80k
    }
98
9.49k
    return std::make_pair(in_pos, values_to_read);
99
9.49k
}
_ZN5doris10BitPacking12UnpackValuesIjLi9EEESt4pairIPKhlES4_llPT_
Line
Count
Source
76
17.4k
                                                            OutType* __restrict__ out) {
77
17.4k
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
78
79
17.4k
    constexpr int BATCH_SIZE = 32;
80
17.4k
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
81
17.4k
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
82
17.4k
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
83
17.4k
    const uint8_t* in_pos = in;
84
17.4k
    OutType* out_pos = out;
85
86
    // First unpack as many full batches as possible.
87
55.5k
    for (int64_t i = 0; i < batches_to_read; ++i) {
88
38.1k
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
89
38.1k
        out_pos += BATCH_SIZE;
90
38.1k
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
91
38.1k
    }
92
93
    // Then unpack the final partial batch.
94
17.4k
    if (remainder_values > 0) {
95
13.1k
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
96
13.1k
                                                        out_pos);
97
13.1k
    }
98
17.4k
    return std::make_pair(in_pos, values_to_read);
99
17.4k
}
_ZN5doris10BitPacking12UnpackValuesIjLi10EEESt4pairIPKhlES4_llPT_
Line
Count
Source
76
14.7k
                                                            OutType* __restrict__ out) {
77
14.7k
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
78
79
14.7k
    constexpr int BATCH_SIZE = 32;
80
14.7k
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
81
14.7k
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
82
14.7k
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
83
14.7k
    const uint8_t* in_pos = in;
84
14.7k
    OutType* out_pos = out;
85
86
    // First unpack as many full batches as possible.
87
51.2k
    for (int64_t i = 0; i < batches_to_read; ++i) {
88
36.4k
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
89
36.4k
        out_pos += BATCH_SIZE;
90
36.4k
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
91
36.4k
    }
92
93
    // Then unpack the final partial batch.
94
14.7k
    if (remainder_values > 0) {
95
11.5k
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
96
11.5k
                                                        out_pos);
97
11.5k
    }
98
14.7k
    return std::make_pair(in_pos, values_to_read);
99
14.7k
}
_ZN5doris10BitPacking12UnpackValuesIjLi11EEESt4pairIPKhlES4_llPT_
Line
Count
Source
76
79.4k
                                                            OutType* __restrict__ out) {
77
79.4k
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
78
79
79.4k
    constexpr int BATCH_SIZE = 32;
80
79.4k
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
81
79.4k
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
82
79.4k
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
83
79.4k
    const uint8_t* in_pos = in;
84
79.4k
    OutType* out_pos = out;
85
86
    // First unpack as many full batches as possible.
87
598k
    for (int64_t i = 0; i < batches_to_read; ++i) {
88
518k
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
89
518k
        out_pos += BATCH_SIZE;
90
518k
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
91
518k
    }
92
93
    // Then unpack the final partial batch.
94
79.4k
    if (remainder_values > 0) {
95
40.0k
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
96
40.0k
                                                        out_pos);
97
40.0k
    }
98
79.4k
    return std::make_pair(in_pos, values_to_read);
99
79.4k
}
_ZN5doris10BitPacking12UnpackValuesIjLi12EEESt4pairIPKhlES4_llPT_
Line
Count
Source
76
535k
                                                            OutType* __restrict__ out) {
77
535k
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
78
79
535k
    constexpr int BATCH_SIZE = 32;
80
535k
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
81
535k
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
82
535k
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
83
535k
    const uint8_t* in_pos = in;
84
535k
    OutType* out_pos = out;
85
86
    // First unpack as many full batches as possible.
87
4.32M
    for (int64_t i = 0; i < batches_to_read; ++i) {
88
3.78M
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
89
3.78M
        out_pos += BATCH_SIZE;
90
3.78M
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
91
3.78M
    }
92
93
    // Then unpack the final partial batch.
94
535k
    if (remainder_values > 0) {
95
266k
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
96
266k
                                                        out_pos);
97
266k
    }
98
535k
    return std::make_pair(in_pos, values_to_read);
99
535k
}
_ZN5doris10BitPacking12UnpackValuesIjLi13EEESt4pairIPKhlES4_llPT_
Line
Count
Source
76
74.5k
                                                            OutType* __restrict__ out) {
77
74.5k
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
78
79
74.5k
    constexpr int BATCH_SIZE = 32;
80
74.5k
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
81
74.5k
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
82
74.5k
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
83
74.5k
    const uint8_t* in_pos = in;
84
74.5k
    OutType* out_pos = out;
85
86
    // First unpack as many full batches as possible.
87
424k
    for (int64_t i = 0; i < batches_to_read; ++i) {
88
350k
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
89
350k
        out_pos += BATCH_SIZE;
90
350k
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
91
350k
    }
92
93
    // Then unpack the final partial batch.
94
74.5k
    if (remainder_values > 0) {
95
42.2k
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
96
42.2k
                                                        out_pos);
97
42.2k
    }
98
74.5k
    return std::make_pair(in_pos, values_to_read);
99
74.5k
}
_ZN5doris10BitPacking12UnpackValuesIjLi14EEESt4pairIPKhlES4_llPT_
Line
Count
Source
76
330k
                                                            OutType* __restrict__ out) {
77
330k
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
78
79
330k
    constexpr int BATCH_SIZE = 32;
80
330k
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
81
330k
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
82
330k
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
83
330k
    const uint8_t* in_pos = in;
84
330k
    OutType* out_pos = out;
85
86
    // First unpack as many full batches as possible.
87
2.58M
    for (int64_t i = 0; i < batches_to_read; ++i) {
88
2.25M
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
89
2.25M
        out_pos += BATCH_SIZE;
90
2.25M
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
91
2.25M
    }
92
93
    // Then unpack the final partial batch.
94
330k
    if (remainder_values > 0) {
95
162k
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
96
162k
                                                        out_pos);
97
162k
    }
98
330k
    return std::make_pair(in_pos, values_to_read);
99
330k
}
_ZN5doris10BitPacking12UnpackValuesIjLi15EEESt4pairIPKhlES4_llPT_
Line
Count
Source
76
72.5k
                                                            OutType* __restrict__ out) {
77
72.5k
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
78
79
72.5k
    constexpr int BATCH_SIZE = 32;
80
72.5k
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
81
72.5k
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
82
72.5k
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
83
72.5k
    const uint8_t* in_pos = in;
84
72.5k
    OutType* out_pos = out;
85
86
    // First unpack as many full batches as possible.
87
442k
    for (int64_t i = 0; i < batches_to_read; ++i) {
88
369k
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
89
369k
        out_pos += BATCH_SIZE;
90
369k
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
91
369k
    }
92
93
    // Then unpack the final partial batch.
94
72.5k
    if (remainder_values > 0) {
95
37.6k
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
96
37.6k
                                                        out_pos);
97
37.6k
    }
98
72.5k
    return std::make_pair(in_pos, values_to_read);
99
72.5k
}
_ZN5doris10BitPacking12UnpackValuesIjLi16EEESt4pairIPKhlES4_llPT_
Line
Count
Source
76
100k
                                                            OutType* __restrict__ out) {
77
100k
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
78
79
100k
    constexpr int BATCH_SIZE = 32;
80
100k
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
81
100k
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
82
100k
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
83
100k
    const uint8_t* in_pos = in;
84
100k
    OutType* out_pos = out;
85
86
    // First unpack as many full batches as possible.
87
761k
    for (int64_t i = 0; i < batches_to_read; ++i) {
88
660k
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
89
660k
        out_pos += BATCH_SIZE;
90
660k
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
91
660k
    }
92
93
    // Then unpack the final partial batch.
94
100k
    if (remainder_values > 0) {
95
49.9k
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
96
49.9k
                                                        out_pos);
97
49.9k
    }
98
100k
    return std::make_pair(in_pos, values_to_read);
99
100k
}
_ZN5doris10BitPacking12UnpackValuesIjLi17EEESt4pairIPKhlES4_llPT_
Line
Count
Source
76
168k
                                                            OutType* __restrict__ out) {
77
168k
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
78
79
168k
    constexpr int BATCH_SIZE = 32;
80
168k
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
81
168k
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
82
168k
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
83
168k
    const uint8_t* in_pos = in;
84
168k
    OutType* out_pos = out;
85
86
    // First unpack as many full batches as possible.
87
1.37M
    for (int64_t i = 0; i < batches_to_read; ++i) {
88
1.21M
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
89
1.21M
        out_pos += BATCH_SIZE;
90
1.21M
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
91
1.21M
    }
92
93
    // Then unpack the final partial batch.
94
168k
    if (remainder_values > 0) {
95
82.8k
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
96
82.8k
                                                        out_pos);
97
82.8k
    }
98
168k
    return std::make_pair(in_pos, values_to_read);
99
168k
}
_ZN5doris10BitPacking12UnpackValuesIjLi18EEESt4pairIPKhlES4_llPT_
Line
Count
Source
76
53.5k
                                                            OutType* __restrict__ out) {
77
53.5k
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
78
79
53.5k
    constexpr int BATCH_SIZE = 32;
80
53.5k
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
81
53.5k
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
82
53.5k
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
83
53.5k
    const uint8_t* in_pos = in;
84
53.5k
    OutType* out_pos = out;
85
86
    // First unpack as many full batches as possible.
87
435k
    for (int64_t i = 0; i < batches_to_read; ++i) {
88
381k
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
89
381k
        out_pos += BATCH_SIZE;
90
381k
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
91
381k
    }
92
93
    // Then unpack the final partial batch.
94
53.5k
    if (remainder_values > 0) {
95
25.7k
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
96
25.7k
                                                        out_pos);
97
25.7k
    }
98
53.5k
    return std::make_pair(in_pos, values_to_read);
99
53.5k
}
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi19EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi20EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi21EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi22EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi23EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi24EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi25EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi26EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi27EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi28EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi29EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi30EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi31EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi32EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi33EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi34EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi35EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi36EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi37EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi38EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi39EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi40EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi41EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi42EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi43EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi44EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi45EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi46EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi47EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi48EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi49EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi50EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi51EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi52EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi53EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi54EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi55EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi56EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi57EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi58EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi59EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi60EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi61EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi62EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi63EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi64EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi0EEESt4pairIPKhlES4_llPT_
_ZN5doris10BitPacking12UnpackValuesIhLi1EEESt4pairIPKhlES4_llPT_
Line
Count
Source
76
9.61k
                                                            OutType* __restrict__ out) {
77
9.61k
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
78
79
9.61k
    constexpr int BATCH_SIZE = 32;
80
9.61k
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
81
9.61k
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
82
9.61k
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
83
9.61k
    const uint8_t* in_pos = in;
84
9.61k
    OutType* out_pos = out;
85
86
    // First unpack as many full batches as possible.
87
25.6k
    for (int64_t i = 0; i < batches_to_read; ++i) {
88
15.9k
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
89
15.9k
        out_pos += BATCH_SIZE;
90
15.9k
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
91
15.9k
    }
92
93
    // Then unpack the final partial batch.
94
9.61k
    if (remainder_values > 0) {
95
4.47k
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
96
4.47k
                                                        out_pos);
97
4.47k
    }
98
9.61k
    return std::make_pair(in_pos, values_to_read);
99
9.61k
}
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi2EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi3EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi4EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi5EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi6EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi7EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi8EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi9EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi10EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi11EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi12EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi13EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi14EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi15EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi16EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi17EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi18EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi19EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi20EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi21EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi22EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi23EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi24EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi25EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi26EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi27EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi28EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi29EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi30EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi31EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi32EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi33EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi34EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi35EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi36EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi37EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi38EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi39EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi40EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi41EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi42EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi43EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi44EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi45EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi46EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi47EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi48EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi49EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi50EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi51EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi52EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi53EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi54EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi55EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi56EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi57EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi58EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi59EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi60EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi61EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi62EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi63EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi64EEESt4pairIPKhlES4_llPT_
100
101
template <typename OutType>
102
std::pair<const uint8_t*, int64_t> BitPacking::UnpackAndDecodeValues(
103
        int bit_width, const uint8_t* __restrict__ in, int64_t in_bytes, OutType* __restrict__ dict,
104
        int64_t dict_len, int64_t num_values, OutType* __restrict__ out, int64_t stride,
105
        bool* __restrict__ decode_error) {
106
#pragma push_macro("UNPACK_VALUES_CASE")
107
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                                 \
108
    case i:                                                                                     \
109
        return UnpackAndDecodeValues<OutType, i>(in, in_bytes, dict, dict_len, num_values, out, \
110
                                                 stride, decode_error);
111
112
    switch (bit_width) {
113
        // Expand cases from 0 to MAX_DICT_BITWIDTH.
114
        BOOST_PP_REPEAT_FROM_TO(0, 33, UNPACK_VALUES_CASE, ignore);
115
    default:
116
        DCHECK(false);
117
        return std::make_pair(nullptr, -1);
118
    }
119
#pragma pop_macro("UNPACK_VALUES_CASE")
120
}
121
template <typename OutType, int BIT_WIDTH>
122
std::pair<const uint8_t*, int64_t> BitPacking::UnpackAndDecodeValues(
123
        const uint8_t* __restrict__ in, int64_t in_bytes, OutType* __restrict__ dict,
124
        int64_t dict_len, int64_t num_values, OutType* __restrict__ out, int64_t stride,
125
        bool* __restrict__ decode_error) {
126
    constexpr int BATCH_SIZE = 32;
127
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
128
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
129
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
130
    const uint8_t* in_pos = in;
131
    uint8_t* out_pos = reinterpret_cast<uint8_t*>(out);
132
    // First unpack as many full batches as possible.
133
    for (int64_t i = 0; i < batches_to_read; ++i) {
134
        in_pos = UnpackAndDecode32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, dict, dict_len,
135
                                                             reinterpret_cast<OutType*>(out_pos),
136
                                                             stride, decode_error);
137
        out_pos += stride * BATCH_SIZE;
138
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
139
    }
140
    // Then unpack the final partial batch.
141
    if (remainder_values > 0) {
142
        in_pos = UnpackAndDecodeUpTo31Values<OutType, BIT_WIDTH>(
143
                in_pos, in_bytes, dict, dict_len, remainder_values,
144
                reinterpret_cast<OutType*>(out_pos), stride, decode_error);
145
    }
146
    return std::make_pair(in_pos, values_to_read);
147
}
148
149
// Loop body of unrolled loop that unpacks the value. BIT_WIDTH is the bit width of
150
// the packed values. 'in_buf' is the start of the input buffer and 'out_vals' is the
151
// start of the output values array. This function unpacks the VALUE_IDX'th packed value
152
// from 'in_buf'.
153
//
154
// This implements essentially the same algorithm as the (Apache-licensed) code in
155
// bpacking.c at https://github.com/lemire/FrameOfReference/, but is much more compact
156
// because it uses templates rather than source-level unrolling of all combinations.
157
//
158
// After the template parameters are expanded and constants are propagated, all branches
159
// and offset/shift calculations should be optimized out, leaving only shifts by constants
160
// and bitmasks by constants. Calls to this must be stamped out manually or with
161
// BOOST_PP_REPEAT_FROM_TO: experimentation revealed that the GCC 4.9.2 optimiser was
162
// not able to fully propagate constants and remove branches when this was called from
163
// inside a for loop with constant bounds with VALUE_IDX changed to a function argument.
164
//
165
// We compute how many 32 bit words we have to read, which is either 1, 2 or 3. If it is
166
// at least 2, the first two 32 bit words are read as one 64 bit word. Even if only one
167
// word needs to be read, we try to read 64 bits if it does not lead to buffer overflow
168
// because benchmarks show that it has a positive effect on performance.
169
//
170
// If 'FULL_BATCH' is true, this function call is part of unpacking 32 values, otherwise
171
// up to 31 values. This is needed to optimise the length of the reads (32 or 64 bits) and
172
// avoid buffer overflow (if we are unpacking 32 values, we can safely assume an input
173
// buffer of length 32 * BIT_WIDTH).
174
template <int BIT_WIDTH, int VALUE_IDX, bool FULL_BATCH>
175
745M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
745M
    if (BIT_WIDTH == 0) return 0;
177
178
745M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
745M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
745M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
745M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
745M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
745M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
745M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
745M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
745M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
745M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
745M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
745M
    constexpr bool READ_32_BITS =
202
745M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
745M
    if (READ_32_BITS) {
205
348M
        uint32_t word = in[FIRST_WORD_IDX];
206
348M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
348M
        return word & mask;
208
348M
    }
209
210
397M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
397M
    word >>= FIRST_BIT_OFFSET;
212
213
397M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
397M
    return word & mask;
220
745M
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi8ELb0EEEmPKh
_ZN5doris11UnpackValueILi0ELi7ELb0EEEmPKh
Line
Count
Source
175
2.36k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.36k
    if (BIT_WIDTH == 0) return 0;
177
178
0
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
0
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
0
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
0
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
0
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
0
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
0
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
0
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
0
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
0
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
0
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
0
    constexpr bool READ_32_BITS =
202
0
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
0
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
0
}
_ZN5doris11UnpackValueILi0ELi6ELb0EEEmPKh
Line
Count
Source
175
2.36k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.36k
    if (BIT_WIDTH == 0) return 0;
177
178
0
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
0
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
0
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
0
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
0
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
0
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
0
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
0
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
0
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
0
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
0
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
0
    constexpr bool READ_32_BITS =
202
0
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
0
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
0
}
_ZN5doris11UnpackValueILi0ELi5ELb0EEEmPKh
Line
Count
Source
175
2.36k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.36k
    if (BIT_WIDTH == 0) return 0;
177
178
0
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
0
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
0
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
0
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
0
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
0
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
0
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
0
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
0
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
0
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
0
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
0
    constexpr bool READ_32_BITS =
202
0
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
0
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
0
}
_ZN5doris11UnpackValueILi0ELi4ELb0EEEmPKh
Line
Count
Source
175
2.36k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.36k
    if (BIT_WIDTH == 0) return 0;
177
178
0
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
0
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
0
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
0
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
0
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
0
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
0
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
0
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
0
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
0
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
0
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
0
    constexpr bool READ_32_BITS =
202
0
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
0
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
0
}
_ZN5doris11UnpackValueILi0ELi3ELb0EEEmPKh
Line
Count
Source
175
2.36k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.36k
    if (BIT_WIDTH == 0) return 0;
177
178
0
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
0
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
0
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
0
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
0
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
0
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
0
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
0
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
0
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
0
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
0
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
0
    constexpr bool READ_32_BITS =
202
0
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
0
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
0
}
_ZN5doris11UnpackValueILi0ELi2ELb0EEEmPKh
Line
Count
Source
175
2.36k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.36k
    if (BIT_WIDTH == 0) return 0;
177
178
0
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
0
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
0
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
0
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
0
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
0
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
0
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
0
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
0
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
0
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
0
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
0
    constexpr bool READ_32_BITS =
202
0
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
0
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
0
}
_ZN5doris11UnpackValueILi0ELi1ELb0EEEmPKh
Line
Count
Source
175
2.36k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.36k
    if (BIT_WIDTH == 0) return 0;
177
178
0
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
0
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
0
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
0
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
0
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
0
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
0
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
0
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
0
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
0
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
0
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
0
    constexpr bool READ_32_BITS =
202
0
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
0
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
0
}
_ZN5doris11UnpackValueILi0ELi0ELb0EEEmPKh
Line
Count
Source
175
2.36k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.36k
    if (BIT_WIDTH == 0) return 0;
177
178
0
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
0
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
0
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
0
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
0
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
0
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
0
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
0
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
0
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
0
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
0
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
0
    constexpr bool READ_32_BITS =
202
0
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
0
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
0
}
_ZN5doris11UnpackValueILi1ELi0ELb1EEEmPKh
Line
Count
Source
175
434k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
434k
    if (BIT_WIDTH == 0) return 0;
177
178
434k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
434k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
434k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
434k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
434k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
434k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
434k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
434k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
434k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
434k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
434k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
434k
    constexpr bool READ_32_BITS =
202
434k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
434k
    if (READ_32_BITS) {
205
433k
        uint32_t word = in[FIRST_WORD_IDX];
206
433k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
433k
        return word & mask;
208
433k
    }
209
210
142
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
142
    word >>= FIRST_BIT_OFFSET;
212
213
142
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
142
    return word & mask;
220
434k
}
_ZN5doris11UnpackValueILi1ELi1ELb1EEEmPKh
Line
Count
Source
175
434k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
434k
    if (BIT_WIDTH == 0) return 0;
177
178
434k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
434k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
434k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
434k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
434k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
434k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
434k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
434k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
434k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
434k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
434k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
434k
    constexpr bool READ_32_BITS =
202
434k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
434k
    if (READ_32_BITS) {
205
434k
        uint32_t word = in[FIRST_WORD_IDX];
206
434k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
434k
        return word & mask;
208
434k
    }
209
210
64
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
64
    word >>= FIRST_BIT_OFFSET;
212
213
64
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
64
    return word & mask;
220
434k
}
_ZN5doris11UnpackValueILi1ELi2ELb1EEEmPKh
Line
Count
Source
175
434k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
434k
    if (BIT_WIDTH == 0) return 0;
177
178
434k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
434k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
434k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
434k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
434k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
434k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
434k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
434k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
434k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
434k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
434k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
434k
    constexpr bool READ_32_BITS =
202
434k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
434k
    if (READ_32_BITS) {
205
433k
        uint32_t word = in[FIRST_WORD_IDX];
206
433k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
433k
        return word & mask;
208
433k
    }
209
210
26
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
26
    word >>= FIRST_BIT_OFFSET;
212
213
26
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
26
    return word & mask;
220
434k
}
_ZN5doris11UnpackValueILi1ELi3ELb1EEEmPKh
Line
Count
Source
175
434k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
434k
    if (BIT_WIDTH == 0) return 0;
177
178
434k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
434k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
434k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
434k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
434k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
434k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
434k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
434k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
434k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
434k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
434k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
434k
    constexpr bool READ_32_BITS =
202
434k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
434k
    if (READ_32_BITS) {
205
434k
        uint32_t word = in[FIRST_WORD_IDX];
206
434k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
434k
        return word & mask;
208
434k
    }
209
210
12
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
12
    word >>= FIRST_BIT_OFFSET;
212
213
12
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
12
    return word & mask;
220
434k
}
_ZN5doris11UnpackValueILi1ELi4ELb1EEEmPKh
Line
Count
Source
175
434k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
434k
    if (BIT_WIDTH == 0) return 0;
177
178
434k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
434k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
434k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
434k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
434k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
434k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
434k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
434k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
434k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
434k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
434k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
434k
    constexpr bool READ_32_BITS =
202
434k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
434k
    if (READ_32_BITS) {
205
434k
        uint32_t word = in[FIRST_WORD_IDX];
206
434k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
434k
        return word & mask;
208
434k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
434k
}
_ZN5doris11UnpackValueILi1ELi5ELb1EEEmPKh
Line
Count
Source
175
434k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
434k
    if (BIT_WIDTH == 0) return 0;
177
178
434k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
434k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
434k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
434k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
434k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
434k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
434k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
434k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
434k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
434k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
434k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
434k
    constexpr bool READ_32_BITS =
202
434k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
434k
    if (READ_32_BITS) {
205
434k
        uint32_t word = in[FIRST_WORD_IDX];
206
434k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
434k
        return word & mask;
208
434k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
434k
}
_ZN5doris11UnpackValueILi1ELi6ELb1EEEmPKh
Line
Count
Source
175
434k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
434k
    if (BIT_WIDTH == 0) return 0;
177
178
434k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
434k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
434k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
434k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
434k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
434k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
434k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
434k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
434k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
434k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
434k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
434k
    constexpr bool READ_32_BITS =
202
434k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
434k
    if (READ_32_BITS) {
205
434k
        uint32_t word = in[FIRST_WORD_IDX];
206
434k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
434k
        return word & mask;
208
434k
    }
209
210
6
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
6
    word >>= FIRST_BIT_OFFSET;
212
213
6
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
6
    return word & mask;
220
434k
}
_ZN5doris11UnpackValueILi1ELi7ELb1EEEmPKh
Line
Count
Source
175
434k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
434k
    if (BIT_WIDTH == 0) return 0;
177
178
434k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
434k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
434k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
434k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
434k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
434k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
434k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
434k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
434k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
434k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
434k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
434k
    constexpr bool READ_32_BITS =
202
434k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
434k
    if (READ_32_BITS) {
205
434k
        uint32_t word = in[FIRST_WORD_IDX];
206
434k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
434k
        return word & mask;
208
434k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
434k
}
_ZN5doris11UnpackValueILi1ELi8ELb1EEEmPKh
Line
Count
Source
175
434k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
434k
    if (BIT_WIDTH == 0) return 0;
177
178
434k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
434k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
434k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
434k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
434k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
434k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
434k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
434k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
434k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
434k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
434k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
434k
    constexpr bool READ_32_BITS =
202
434k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
434k
    if (READ_32_BITS) {
205
433k
        uint32_t word = in[FIRST_WORD_IDX];
206
433k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
433k
        return word & mask;
208
433k
    }
209
210
180
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
180
    word >>= FIRST_BIT_OFFSET;
212
213
180
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
180
    return word & mask;
220
434k
}
_ZN5doris11UnpackValueILi1ELi9ELb1EEEmPKh
Line
Count
Source
175
434k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
434k
    if (BIT_WIDTH == 0) return 0;
177
178
434k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
434k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
434k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
434k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
434k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
434k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
434k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
434k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
434k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
434k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
434k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
434k
    constexpr bool READ_32_BITS =
202
434k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
434k
    if (READ_32_BITS) {
205
433k
        uint32_t word = in[FIRST_WORD_IDX];
206
433k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
433k
        return word & mask;
208
433k
    }
209
210
240
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
240
    word >>= FIRST_BIT_OFFSET;
212
213
240
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
240
    return word & mask;
220
434k
}
_ZN5doris11UnpackValueILi1ELi10ELb1EEEmPKh
Line
Count
Source
175
434k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
434k
    if (BIT_WIDTH == 0) return 0;
177
178
434k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
434k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
434k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
434k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
434k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
434k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
434k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
434k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
434k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
434k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
434k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
434k
    constexpr bool READ_32_BITS =
202
434k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
434k
    if (READ_32_BITS) {
205
433k
        uint32_t word = in[FIRST_WORD_IDX];
206
433k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
433k
        return word & mask;
208
433k
    }
209
210
276
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
276
    word >>= FIRST_BIT_OFFSET;
212
213
276
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
276
    return word & mask;
220
434k
}
_ZN5doris11UnpackValueILi1ELi11ELb1EEEmPKh
Line
Count
Source
175
433k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
433k
    if (BIT_WIDTH == 0) return 0;
177
178
433k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
433k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
433k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
433k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
433k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
433k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
433k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
433k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
433k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
433k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
433k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
433k
    constexpr bool READ_32_BITS =
202
433k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
433k
    if (READ_32_BITS) {
205
433k
        uint32_t word = in[FIRST_WORD_IDX];
206
433k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
433k
        return word & mask;
208
433k
    }
209
210
170
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
170
    word >>= FIRST_BIT_OFFSET;
212
213
170
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
170
    return word & mask;
220
433k
}
_ZN5doris11UnpackValueILi1ELi12ELb1EEEmPKh
Line
Count
Source
175
433k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
433k
    if (BIT_WIDTH == 0) return 0;
177
178
433k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
433k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
433k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
433k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
433k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
433k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
433k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
433k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
433k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
433k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
433k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
433k
    constexpr bool READ_32_BITS =
202
433k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
433k
    if (READ_32_BITS) {
205
433k
        uint32_t word = in[FIRST_WORD_IDX];
206
433k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
433k
        return word & mask;
208
433k
    }
209
210
176
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
176
    word >>= FIRST_BIT_OFFSET;
212
213
176
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
176
    return word & mask;
220
433k
}
_ZN5doris11UnpackValueILi1ELi13ELb1EEEmPKh
Line
Count
Source
175
433k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
433k
    if (BIT_WIDTH == 0) return 0;
177
178
433k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
433k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
433k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
433k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
433k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
433k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
433k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
433k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
433k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
433k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
433k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
433k
    constexpr bool READ_32_BITS =
202
433k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
433k
    if (READ_32_BITS) {
205
433k
        uint32_t word = in[FIRST_WORD_IDX];
206
433k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
433k
        return word & mask;
208
433k
    }
209
210
114
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
114
    word >>= FIRST_BIT_OFFSET;
212
213
114
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
114
    return word & mask;
220
433k
}
_ZN5doris11UnpackValueILi1ELi14ELb1EEEmPKh
Line
Count
Source
175
433k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
433k
    if (BIT_WIDTH == 0) return 0;
177
178
433k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
433k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
433k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
433k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
433k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
433k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
433k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
433k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
433k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
433k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
433k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
433k
    constexpr bool READ_32_BITS =
202
433k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
433k
    if (READ_32_BITS) {
205
433k
        uint32_t word = in[FIRST_WORD_IDX];
206
433k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
433k
        return word & mask;
208
433k
    }
209
210
98
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
98
    word >>= FIRST_BIT_OFFSET;
212
213
98
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
98
    return word & mask;
220
433k
}
_ZN5doris11UnpackValueILi1ELi15ELb1EEEmPKh
Line
Count
Source
175
433k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
433k
    if (BIT_WIDTH == 0) return 0;
177
178
433k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
433k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
433k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
433k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
433k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
433k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
433k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
433k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
433k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
433k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
433k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
433k
    constexpr bool READ_32_BITS =
202
433k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
433k
    if (READ_32_BITS) {
205
433k
        uint32_t word = in[FIRST_WORD_IDX];
206
433k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
433k
        return word & mask;
208
433k
    }
209
210
68
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
68
    word >>= FIRST_BIT_OFFSET;
212
213
68
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
68
    return word & mask;
220
433k
}
_ZN5doris11UnpackValueILi1ELi16ELb1EEEmPKh
Line
Count
Source
175
433k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
433k
    if (BIT_WIDTH == 0) return 0;
177
178
433k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
433k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
433k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
433k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
433k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
433k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
433k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
433k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
433k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
433k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
433k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
433k
    constexpr bool READ_32_BITS =
202
433k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
433k
    if (READ_32_BITS) {
205
433k
        uint32_t word = in[FIRST_WORD_IDX];
206
433k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
433k
        return word & mask;
208
433k
    }
209
210
82
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
82
    word >>= FIRST_BIT_OFFSET;
212
213
82
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
82
    return word & mask;
220
433k
}
_ZN5doris11UnpackValueILi1ELi17ELb1EEEmPKh
Line
Count
Source
175
433k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
433k
    if (BIT_WIDTH == 0) return 0;
177
178
433k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
433k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
433k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
433k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
433k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
433k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
433k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
433k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
433k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
433k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
433k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
433k
    constexpr bool READ_32_BITS =
202
433k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
433k
    if (READ_32_BITS) {
205
433k
        uint32_t word = in[FIRST_WORD_IDX];
206
433k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
433k
        return word & mask;
208
433k
    }
209
210
108
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
108
    word >>= FIRST_BIT_OFFSET;
212
213
108
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
108
    return word & mask;
220
433k
}
_ZN5doris11UnpackValueILi1ELi18ELb1EEEmPKh
Line
Count
Source
175
433k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
433k
    if (BIT_WIDTH == 0) return 0;
177
178
433k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
433k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
433k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
433k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
433k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
433k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
433k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
433k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
433k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
433k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
433k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
433k
    constexpr bool READ_32_BITS =
202
433k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
433k
    if (READ_32_BITS) {
205
433k
        uint32_t word = in[FIRST_WORD_IDX];
206
433k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
433k
        return word & mask;
208
433k
    }
209
210
132
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
132
    word >>= FIRST_BIT_OFFSET;
212
213
132
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
132
    return word & mask;
220
433k
}
_ZN5doris11UnpackValueILi1ELi19ELb1EEEmPKh
Line
Count
Source
175
433k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
433k
    if (BIT_WIDTH == 0) return 0;
177
178
433k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
433k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
433k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
433k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
433k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
433k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
433k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
433k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
433k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
433k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
433k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
433k
    constexpr bool READ_32_BITS =
202
433k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
433k
    if (READ_32_BITS) {
205
433k
        uint32_t word = in[FIRST_WORD_IDX];
206
433k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
433k
        return word & mask;
208
433k
    }
209
210
82
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
82
    word >>= FIRST_BIT_OFFSET;
212
213
82
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
82
    return word & mask;
220
433k
}
_ZN5doris11UnpackValueILi1ELi20ELb1EEEmPKh
Line
Count
Source
175
433k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
433k
    if (BIT_WIDTH == 0) return 0;
177
178
433k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
433k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
433k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
433k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
433k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
433k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
433k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
433k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
433k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
433k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
433k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
433k
    constexpr bool READ_32_BITS =
202
433k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
433k
    if (READ_32_BITS) {
205
433k
        uint32_t word = in[FIRST_WORD_IDX];
206
433k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
433k
        return word & mask;
208
433k
    }
209
210
118
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
118
    word >>= FIRST_BIT_OFFSET;
212
213
118
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
118
    return word & mask;
220
433k
}
_ZN5doris11UnpackValueILi1ELi21ELb1EEEmPKh
Line
Count
Source
175
433k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
433k
    if (BIT_WIDTH == 0) return 0;
177
178
433k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
433k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
433k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
433k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
433k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
433k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
433k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
433k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
433k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
433k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
433k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
433k
    constexpr bool READ_32_BITS =
202
433k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
433k
    if (READ_32_BITS) {
205
433k
        uint32_t word = in[FIRST_WORD_IDX];
206
433k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
433k
        return word & mask;
208
433k
    }
209
210
100
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
100
    word >>= FIRST_BIT_OFFSET;
212
213
100
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
100
    return word & mask;
220
433k
}
_ZN5doris11UnpackValueILi1ELi22ELb1EEEmPKh
Line
Count
Source
175
433k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
433k
    if (BIT_WIDTH == 0) return 0;
177
178
433k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
433k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
433k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
433k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
433k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
433k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
433k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
433k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
433k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
433k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
433k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
433k
    constexpr bool READ_32_BITS =
202
433k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
433k
    if (READ_32_BITS) {
205
433k
        uint32_t word = in[FIRST_WORD_IDX];
206
433k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
433k
        return word & mask;
208
433k
    }
209
210
86
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
86
    word >>= FIRST_BIT_OFFSET;
212
213
86
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
86
    return word & mask;
220
433k
}
_ZN5doris11UnpackValueILi1ELi23ELb1EEEmPKh
Line
Count
Source
175
433k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
433k
    if (BIT_WIDTH == 0) return 0;
177
178
433k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
433k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
433k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
433k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
433k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
433k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
433k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
433k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
433k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
433k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
433k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
433k
    constexpr bool READ_32_BITS =
202
433k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
433k
    if (READ_32_BITS) {
205
433k
        uint32_t word = in[FIRST_WORD_IDX];
206
433k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
433k
        return word & mask;
208
433k
    }
209
210
58
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
58
    word >>= FIRST_BIT_OFFSET;
212
213
58
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
58
    return word & mask;
220
433k
}
_ZN5doris11UnpackValueILi1ELi24ELb1EEEmPKh
Line
Count
Source
175
433k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
433k
    if (BIT_WIDTH == 0) return 0;
177
178
433k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
433k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
433k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
433k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
433k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
433k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
433k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
433k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
433k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
433k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
433k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
433k
    constexpr bool READ_32_BITS =
202
433k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
433k
    if (READ_32_BITS) {
205
433k
        uint32_t word = in[FIRST_WORD_IDX];
206
433k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
433k
        return word & mask;
208
433k
    }
209
210
26
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
26
    word >>= FIRST_BIT_OFFSET;
212
213
26
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
26
    return word & mask;
220
433k
}
_ZN5doris11UnpackValueILi1ELi25ELb1EEEmPKh
Line
Count
Source
175
433k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
433k
    if (BIT_WIDTH == 0) return 0;
177
178
433k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
433k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
433k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
433k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
433k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
433k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
433k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
433k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
433k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
433k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
433k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
433k
    constexpr bool READ_32_BITS =
202
433k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
433k
    if (READ_32_BITS) {
205
433k
        uint32_t word = in[FIRST_WORD_IDX];
206
433k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
433k
        return word & mask;
208
433k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
433k
}
_ZN5doris11UnpackValueILi1ELi26ELb1EEEmPKh
Line
Count
Source
175
433k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
433k
    if (BIT_WIDTH == 0) return 0;
177
178
433k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
433k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
433k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
433k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
433k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
433k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
433k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
433k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
433k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
433k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
433k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
433k
    constexpr bool READ_32_BITS =
202
433k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
433k
    if (READ_32_BITS) {
205
433k
        uint32_t word = in[FIRST_WORD_IDX];
206
433k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
433k
        return word & mask;
208
433k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
433k
}
_ZN5doris11UnpackValueILi1ELi27ELb1EEEmPKh
Line
Count
Source
175
433k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
433k
    if (BIT_WIDTH == 0) return 0;
177
178
433k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
433k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
433k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
433k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
433k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
433k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
433k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
433k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
433k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
433k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
433k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
433k
    constexpr bool READ_32_BITS =
202
433k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
433k
    if (READ_32_BITS) {
205
433k
        uint32_t word = in[FIRST_WORD_IDX];
206
433k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
433k
        return word & mask;
208
433k
    }
209
210
8
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
8
    word >>= FIRST_BIT_OFFSET;
212
213
8
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
8
    return word & mask;
220
433k
}
_ZN5doris11UnpackValueILi1ELi28ELb1EEEmPKh
Line
Count
Source
175
433k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
433k
    if (BIT_WIDTH == 0) return 0;
177
178
433k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
433k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
433k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
433k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
433k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
433k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
433k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
433k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
433k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
433k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
433k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
433k
    constexpr bool READ_32_BITS =
202
433k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
433k
    if (READ_32_BITS) {
205
433k
        uint32_t word = in[FIRST_WORD_IDX];
206
433k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
433k
        return word & mask;
208
433k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
433k
}
_ZN5doris11UnpackValueILi1ELi29ELb1EEEmPKh
Line
Count
Source
175
433k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
433k
    if (BIT_WIDTH == 0) return 0;
177
178
433k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
433k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
433k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
433k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
433k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
433k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
433k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
433k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
433k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
433k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
433k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
433k
    constexpr bool READ_32_BITS =
202
433k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
433k
    if (READ_32_BITS) {
205
433k
        uint32_t word = in[FIRST_WORD_IDX];
206
433k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
433k
        return word & mask;
208
433k
    }
209
210
16
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
16
    word >>= FIRST_BIT_OFFSET;
212
213
16
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
16
    return word & mask;
220
433k
}
_ZN5doris11UnpackValueILi1ELi30ELb1EEEmPKh
Line
Count
Source
175
433k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
433k
    if (BIT_WIDTH == 0) return 0;
177
178
433k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
433k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
433k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
433k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
433k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
433k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
433k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
433k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
433k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
433k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
433k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
433k
    constexpr bool READ_32_BITS =
202
433k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
433k
    if (READ_32_BITS) {
205
433k
        uint32_t word = in[FIRST_WORD_IDX];
206
433k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
433k
        return word & mask;
208
433k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
433k
}
_ZN5doris11UnpackValueILi1ELi31ELb1EEEmPKh
Line
Count
Source
175
433k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
433k
    if (BIT_WIDTH == 0) return 0;
177
178
433k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
433k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
433k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
433k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
433k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
433k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
433k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
433k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
433k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
433k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
433k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
433k
    constexpr bool READ_32_BITS =
202
433k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
433k
    if (READ_32_BITS) {
205
433k
        uint32_t word = in[FIRST_WORD_IDX];
206
433k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
433k
        return word & mask;
208
433k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
433k
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi1ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi1ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi1ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi1ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi1ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi1ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi1ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi1ELi23ELb0EEEmPKh
Line
Count
Source
175
199k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
199k
    if (BIT_WIDTH == 0) return 0;
177
178
199k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
199k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
199k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
199k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
199k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
199k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
199k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
199k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
199k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
199k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
199k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
199k
    constexpr bool READ_32_BITS =
202
199k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
199k
    if (READ_32_BITS) {
205
199k
        uint32_t word = in[FIRST_WORD_IDX];
206
199k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
199k
        return word & mask;
208
199k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
199k
}
_ZN5doris11UnpackValueILi1ELi22ELb0EEEmPKh
Line
Count
Source
175
199k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
199k
    if (BIT_WIDTH == 0) return 0;
177
178
199k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
199k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
199k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
199k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
199k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
199k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
199k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
199k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
199k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
199k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
199k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
199k
    constexpr bool READ_32_BITS =
202
199k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
199k
    if (READ_32_BITS) {
205
199k
        uint32_t word = in[FIRST_WORD_IDX];
206
199k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
199k
        return word & mask;
208
199k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
199k
}
_ZN5doris11UnpackValueILi1ELi21ELb0EEEmPKh
Line
Count
Source
175
199k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
199k
    if (BIT_WIDTH == 0) return 0;
177
178
199k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
199k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
199k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
199k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
199k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
199k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
199k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
199k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
199k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
199k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
199k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
199k
    constexpr bool READ_32_BITS =
202
199k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
199k
    if (READ_32_BITS) {
205
199k
        uint32_t word = in[FIRST_WORD_IDX];
206
199k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
199k
        return word & mask;
208
199k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
199k
}
_ZN5doris11UnpackValueILi1ELi20ELb0EEEmPKh
Line
Count
Source
175
199k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
199k
    if (BIT_WIDTH == 0) return 0;
177
178
199k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
199k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
199k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
199k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
199k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
199k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
199k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
199k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
199k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
199k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
199k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
199k
    constexpr bool READ_32_BITS =
202
199k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
199k
    if (READ_32_BITS) {
205
199k
        uint32_t word = in[FIRST_WORD_IDX];
206
199k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
199k
        return word & mask;
208
199k
    }
209
210
10
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
10
    word >>= FIRST_BIT_OFFSET;
212
213
10
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
10
    return word & mask;
220
199k
}
_ZN5doris11UnpackValueILi1ELi19ELb0EEEmPKh
Line
Count
Source
175
199k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
199k
    if (BIT_WIDTH == 0) return 0;
177
178
199k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
199k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
199k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
199k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
199k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
199k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
199k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
199k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
199k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
199k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
199k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
199k
    constexpr bool READ_32_BITS =
202
199k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
199k
    if (READ_32_BITS) {
205
199k
        uint32_t word = in[FIRST_WORD_IDX];
206
199k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
199k
        return word & mask;
208
199k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
199k
}
_ZN5doris11UnpackValueILi1ELi18ELb0EEEmPKh
Line
Count
Source
175
199k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
199k
    if (BIT_WIDTH == 0) return 0;
177
178
199k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
199k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
199k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
199k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
199k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
199k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
199k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
199k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
199k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
199k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
199k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
199k
    constexpr bool READ_32_BITS =
202
199k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
199k
    if (READ_32_BITS) {
205
199k
        uint32_t word = in[FIRST_WORD_IDX];
206
199k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
199k
        return word & mask;
208
199k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
199k
}
_ZN5doris11UnpackValueILi1ELi17ELb0EEEmPKh
Line
Count
Source
175
199k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
199k
    if (BIT_WIDTH == 0) return 0;
177
178
199k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
199k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
199k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
199k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
199k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
199k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
199k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
199k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
199k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
199k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
199k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
199k
    constexpr bool READ_32_BITS =
202
199k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
199k
    if (READ_32_BITS) {
205
199k
        uint32_t word = in[FIRST_WORD_IDX];
206
199k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
199k
        return word & mask;
208
199k
    }
209
210
6
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
6
    word >>= FIRST_BIT_OFFSET;
212
213
6
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
6
    return word & mask;
220
199k
}
_ZN5doris11UnpackValueILi1ELi16ELb0EEEmPKh
Line
Count
Source
175
199k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
199k
    if (BIT_WIDTH == 0) return 0;
177
178
199k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
199k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
199k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
199k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
199k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
199k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
199k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
199k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
199k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
199k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
199k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
199k
    constexpr bool READ_32_BITS =
202
199k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
199k
    if (READ_32_BITS) {
205
199k
        uint32_t word = in[FIRST_WORD_IDX];
206
199k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
199k
        return word & mask;
208
199k
    }
209
210
6
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
6
    word >>= FIRST_BIT_OFFSET;
212
213
6
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
6
    return word & mask;
220
199k
}
_ZN5doris11UnpackValueILi1ELi15ELb0EEEmPKh
Line
Count
Source
175
493k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
493k
    if (BIT_WIDTH == 0) return 0;
177
178
493k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
493k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
493k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
493k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
493k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
493k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
493k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
493k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
493k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
493k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
493k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
493k
    constexpr bool READ_32_BITS =
202
493k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
493k
    if (READ_32_BITS) {
205
493k
        uint32_t word = in[FIRST_WORD_IDX];
206
493k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
493k
        return word & mask;
208
493k
    }
209
210
8
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
8
    word >>= FIRST_BIT_OFFSET;
212
213
8
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
8
    return word & mask;
220
493k
}
_ZN5doris11UnpackValueILi1ELi14ELb0EEEmPKh
Line
Count
Source
175
493k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
493k
    if (BIT_WIDTH == 0) return 0;
177
178
493k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
493k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
493k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
493k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
493k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
493k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
493k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
493k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
493k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
493k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
493k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
493k
    constexpr bool READ_32_BITS =
202
493k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
493k
    if (READ_32_BITS) {
205
493k
        uint32_t word = in[FIRST_WORD_IDX];
206
493k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
493k
        return word & mask;
208
493k
    }
209
210
8
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
8
    word >>= FIRST_BIT_OFFSET;
212
213
8
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
8
    return word & mask;
220
493k
}
_ZN5doris11UnpackValueILi1ELi13ELb0EEEmPKh
Line
Count
Source
175
493k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
493k
    if (BIT_WIDTH == 0) return 0;
177
178
493k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
493k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
493k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
493k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
493k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
493k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
493k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
493k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
493k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
493k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
493k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
493k
    constexpr bool READ_32_BITS =
202
493k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
493k
    if (READ_32_BITS) {
205
493k
        uint32_t word = in[FIRST_WORD_IDX];
206
493k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
493k
        return word & mask;
208
493k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
493k
}
_ZN5doris11UnpackValueILi1ELi12ELb0EEEmPKh
Line
Count
Source
175
493k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
493k
    if (BIT_WIDTH == 0) return 0;
177
178
493k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
493k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
493k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
493k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
493k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
493k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
493k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
493k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
493k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
493k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
493k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
493k
    constexpr bool READ_32_BITS =
202
493k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
493k
    if (READ_32_BITS) {
205
493k
        uint32_t word = in[FIRST_WORD_IDX];
206
493k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
493k
        return word & mask;
208
493k
    }
209
210
50
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
50
    word >>= FIRST_BIT_OFFSET;
212
213
50
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
50
    return word & mask;
220
493k
}
_ZN5doris11UnpackValueILi1ELi11ELb0EEEmPKh
Line
Count
Source
175
493k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
493k
    if (BIT_WIDTH == 0) return 0;
177
178
493k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
493k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
493k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
493k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
493k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
493k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
493k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
493k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
493k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
493k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
493k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
493k
    constexpr bool READ_32_BITS =
202
493k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
493k
    if (READ_32_BITS) {
205
493k
        uint32_t word = in[FIRST_WORD_IDX];
206
493k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
493k
        return word & mask;
208
493k
    }
209
210
14
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
14
    word >>= FIRST_BIT_OFFSET;
212
213
14
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
14
    return word & mask;
220
493k
}
_ZN5doris11UnpackValueILi1ELi10ELb0EEEmPKh
Line
Count
Source
175
493k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
493k
    if (BIT_WIDTH == 0) return 0;
177
178
493k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
493k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
493k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
493k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
493k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
493k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
493k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
493k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
493k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
493k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
493k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
493k
    constexpr bool READ_32_BITS =
202
493k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
493k
    if (READ_32_BITS) {
205
493k
        uint32_t word = in[FIRST_WORD_IDX];
206
493k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
493k
        return word & mask;
208
493k
    }
209
210
6
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
6
    word >>= FIRST_BIT_OFFSET;
212
213
6
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
6
    return word & mask;
220
493k
}
_ZN5doris11UnpackValueILi1ELi9ELb0EEEmPKh
Line
Count
Source
175
493k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
493k
    if (BIT_WIDTH == 0) return 0;
177
178
493k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
493k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
493k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
493k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
493k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
493k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
493k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
493k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
493k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
493k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
493k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
493k
    constexpr bool READ_32_BITS =
202
493k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
493k
    if (READ_32_BITS) {
205
493k
        uint32_t word = in[FIRST_WORD_IDX];
206
493k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
493k
        return word & mask;
208
493k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
493k
}
_ZN5doris11UnpackValueILi1ELi8ELb0EEEmPKh
Line
Count
Source
175
493k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
493k
    if (BIT_WIDTH == 0) return 0;
177
178
493k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
493k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
493k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
493k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
493k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
493k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
493k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
493k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
493k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
493k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
493k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
493k
    constexpr bool READ_32_BITS =
202
493k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
493k
    if (READ_32_BITS) {
205
493k
        uint32_t word = in[FIRST_WORD_IDX];
206
493k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
493k
        return word & mask;
208
493k
    }
209
210
28
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
28
    word >>= FIRST_BIT_OFFSET;
212
213
28
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
28
    return word & mask;
220
493k
}
_ZN5doris11UnpackValueILi1ELi7ELb0EEEmPKh
Line
Count
Source
175
968k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
968k
    if (BIT_WIDTH == 0) return 0;
177
178
968k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
968k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
968k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
968k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
968k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
968k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
968k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
968k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
968k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
968k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
968k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
968k
    constexpr bool READ_32_BITS =
202
968k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
968k
    if (READ_32_BITS) {
205
967k
        uint32_t word = in[FIRST_WORD_IDX];
206
967k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
967k
        return word & mask;
208
967k
    }
209
210
146
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
146
    word >>= FIRST_BIT_OFFSET;
212
213
146
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
146
    return word & mask;
220
968k
}
_ZN5doris11UnpackValueILi1ELi6ELb0EEEmPKh
Line
Count
Source
175
967k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
967k
    if (BIT_WIDTH == 0) return 0;
177
178
967k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
967k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
967k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
967k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
967k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
967k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
967k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
967k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
967k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
967k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
967k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
967k
    constexpr bool READ_32_BITS =
202
967k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
967k
    if (READ_32_BITS) {
205
967k
        uint32_t word = in[FIRST_WORD_IDX];
206
967k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
967k
        return word & mask;
208
967k
    }
209
210
102
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
102
    word >>= FIRST_BIT_OFFSET;
212
213
102
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
102
    return word & mask;
220
967k
}
_ZN5doris11UnpackValueILi1ELi5ELb0EEEmPKh
Line
Count
Source
175
968k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
968k
    if (BIT_WIDTH == 0) return 0;
177
178
968k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
968k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
968k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
968k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
968k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
968k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
968k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
968k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
968k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
968k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
968k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
968k
    constexpr bool READ_32_BITS =
202
968k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
968k
    if (READ_32_BITS) {
205
967k
        uint32_t word = in[FIRST_WORD_IDX];
206
967k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
967k
        return word & mask;
208
967k
    }
209
210
78
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
78
    word >>= FIRST_BIT_OFFSET;
212
213
78
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
78
    return word & mask;
220
968k
}
_ZN5doris11UnpackValueILi1ELi4ELb0EEEmPKh
Line
Count
Source
175
967k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
967k
    if (BIT_WIDTH == 0) return 0;
177
178
967k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
967k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
967k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
967k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
967k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
967k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
967k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
967k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
967k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
967k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
967k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
967k
    constexpr bool READ_32_BITS =
202
967k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
967k
    if (READ_32_BITS) {
205
967k
        uint32_t word = in[FIRST_WORD_IDX];
206
967k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
967k
        return word & mask;
208
967k
    }
209
210
34
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
34
    word >>= FIRST_BIT_OFFSET;
212
213
34
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
34
    return word & mask;
220
967k
}
_ZN5doris11UnpackValueILi1ELi3ELb0EEEmPKh
Line
Count
Source
175
967k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
967k
    if (BIT_WIDTH == 0) return 0;
177
178
967k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
967k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
967k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
967k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
967k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
967k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
967k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
967k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
967k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
967k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
967k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
967k
    constexpr bool READ_32_BITS =
202
967k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
967k
    if (READ_32_BITS) {
205
967k
        uint32_t word = in[FIRST_WORD_IDX];
206
967k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
967k
        return word & mask;
208
967k
    }
209
210
28
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
28
    word >>= FIRST_BIT_OFFSET;
212
213
28
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
28
    return word & mask;
220
967k
}
_ZN5doris11UnpackValueILi1ELi2ELb0EEEmPKh
Line
Count
Source
175
967k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
967k
    if (BIT_WIDTH == 0) return 0;
177
178
967k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
967k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
967k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
967k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
967k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
967k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
967k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
967k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
967k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
967k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
967k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
967k
    constexpr bool READ_32_BITS =
202
967k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
967k
    if (READ_32_BITS) {
205
967k
        uint32_t word = in[FIRST_WORD_IDX];
206
967k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
967k
        return word & mask;
208
967k
    }
209
210
30
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
30
    word >>= FIRST_BIT_OFFSET;
212
213
30
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
30
    return word & mask;
220
967k
}
_ZN5doris11UnpackValueILi1ELi1ELb0EEEmPKh
Line
Count
Source
175
967k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
967k
    if (BIT_WIDTH == 0) return 0;
177
178
967k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
967k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
967k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
967k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
967k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
967k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
967k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
967k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
967k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
967k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
967k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
967k
    constexpr bool READ_32_BITS =
202
967k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
967k
    if (READ_32_BITS) {
205
967k
        uint32_t word = in[FIRST_WORD_IDX];
206
967k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
967k
        return word & mask;
208
967k
    }
209
210
26
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
26
    word >>= FIRST_BIT_OFFSET;
212
213
26
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
26
    return word & mask;
220
967k
}
_ZN5doris11UnpackValueILi1ELi0ELb0EEEmPKh
Line
Count
Source
175
967k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
967k
    if (BIT_WIDTH == 0) return 0;
177
178
967k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
967k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
967k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
967k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
967k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
967k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
967k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
967k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
967k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
967k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
967k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
967k
    constexpr bool READ_32_BITS =
202
967k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
967k
    if (READ_32_BITS) {
205
967k
        uint32_t word = in[FIRST_WORD_IDX];
206
967k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
967k
        return word & mask;
208
967k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
967k
}
_ZN5doris11UnpackValueILi2ELi0ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
824
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
824
    word >>= FIRST_BIT_OFFSET;
212
213
824
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
824
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi1ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
138
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
138
    word >>= FIRST_BIT_OFFSET;
212
213
138
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
138
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi2ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
1.35k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.35k
    word >>= FIRST_BIT_OFFSET;
212
213
1.35k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.35k
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi3ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
1.38k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.38k
    word >>= FIRST_BIT_OFFSET;
212
213
1.38k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.38k
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi4ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
2.16k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.16k
    word >>= FIRST_BIT_OFFSET;
212
213
2.16k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.16k
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi5ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
1.87k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.87k
    word >>= FIRST_BIT_OFFSET;
212
213
1.87k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.87k
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi6ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
494
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
494
    word >>= FIRST_BIT_OFFSET;
212
213
494
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
494
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi7ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi8ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
1.17k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.17k
    word >>= FIRST_BIT_OFFSET;
212
213
1.17k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.17k
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi9ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
1.36k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.36k
    word >>= FIRST_BIT_OFFSET;
212
213
1.36k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.36k
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi10ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
1.97k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.97k
    word >>= FIRST_BIT_OFFSET;
212
213
1.97k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.97k
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi11ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
1.70k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.70k
    word >>= FIRST_BIT_OFFSET;
212
213
1.70k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.70k
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi12ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
1.78k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.78k
    word >>= FIRST_BIT_OFFSET;
212
213
1.78k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.78k
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi13ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
1.30k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.30k
    word >>= FIRST_BIT_OFFSET;
212
213
1.30k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.30k
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi14ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
1.10k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.10k
    word >>= FIRST_BIT_OFFSET;
212
213
1.10k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.10k
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi15ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
1.20k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.20k
    word >>= FIRST_BIT_OFFSET;
212
213
1.20k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.20k
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi16ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi17ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi18ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
524
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
524
    word >>= FIRST_BIT_OFFSET;
212
213
524
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
524
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi19ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
548
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
548
    word >>= FIRST_BIT_OFFSET;
212
213
548
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
548
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi20ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
714
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
714
    word >>= FIRST_BIT_OFFSET;
212
213
714
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
714
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi21ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
818
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
818
    word >>= FIRST_BIT_OFFSET;
212
213
818
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
818
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi22ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
1.22k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.22k
    word >>= FIRST_BIT_OFFSET;
212
213
1.22k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.22k
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi23ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
106
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
106
    word >>= FIRST_BIT_OFFSET;
212
213
106
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
106
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi24ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
52
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
52
    word >>= FIRST_BIT_OFFSET;
212
213
52
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
52
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi25ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
48
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
48
    word >>= FIRST_BIT_OFFSET;
212
213
48
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
48
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi26ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi27ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
782
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
782
    word >>= FIRST_BIT_OFFSET;
212
213
782
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
782
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi28ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
774
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
774
    word >>= FIRST_BIT_OFFSET;
212
213
774
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
774
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi29ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
358
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
358
    word >>= FIRST_BIT_OFFSET;
212
213
358
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
358
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi30ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
664
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
664
    word >>= FIRST_BIT_OFFSET;
212
213
664
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
664
    return word & mask;
220
1.70M
}
_ZN5doris11UnpackValueILi2ELi31ELb1EEEmPKh
Line
Count
Source
175
1.70M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.70M
    if (BIT_WIDTH == 0) return 0;
177
178
1.70M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.70M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.70M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.70M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.70M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.70M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.70M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.70M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.70M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.70M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.70M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.70M
    constexpr bool READ_32_BITS =
202
1.70M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.70M
    if (READ_32_BITS) {
205
1.70M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.70M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.70M
        return word & mask;
208
1.70M
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
1.70M
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi2ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi2ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi2ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi2ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi2ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi2ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi2ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi2ELi23ELb0EEEmPKh
Line
Count
Source
175
213k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
213k
    if (BIT_WIDTH == 0) return 0;
177
178
213k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
213k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
213k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
213k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
213k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
213k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
213k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
213k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
213k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
213k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
213k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
213k
    constexpr bool READ_32_BITS =
202
213k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
213k
    if (READ_32_BITS) {
205
213k
        uint32_t word = in[FIRST_WORD_IDX];
206
213k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
213k
        return word & mask;
208
213k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
213k
}
_ZN5doris11UnpackValueILi2ELi22ELb0EEEmPKh
Line
Count
Source
175
213k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
213k
    if (BIT_WIDTH == 0) return 0;
177
178
213k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
213k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
213k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
213k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
213k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
213k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
213k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
213k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
213k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
213k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
213k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
213k
    constexpr bool READ_32_BITS =
202
213k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
213k
    if (READ_32_BITS) {
205
213k
        uint32_t word = in[FIRST_WORD_IDX];
206
213k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
213k
        return word & mask;
208
213k
    }
209
210
20
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
20
    word >>= FIRST_BIT_OFFSET;
212
213
20
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
20
    return word & mask;
220
213k
}
_ZN5doris11UnpackValueILi2ELi21ELb0EEEmPKh
Line
Count
Source
175
213k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
213k
    if (BIT_WIDTH == 0) return 0;
177
178
213k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
213k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
213k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
213k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
213k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
213k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
213k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
213k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
213k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
213k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
213k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
213k
    constexpr bool READ_32_BITS =
202
213k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
213k
    if (READ_32_BITS) {
205
213k
        uint32_t word = in[FIRST_WORD_IDX];
206
213k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
213k
        return word & mask;
208
213k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
213k
}
_ZN5doris11UnpackValueILi2ELi20ELb0EEEmPKh
Line
Count
Source
175
213k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
213k
    if (BIT_WIDTH == 0) return 0;
177
178
213k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
213k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
213k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
213k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
213k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
213k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
213k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
213k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
213k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
213k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
213k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
213k
    constexpr bool READ_32_BITS =
202
213k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
213k
    if (READ_32_BITS) {
205
213k
        uint32_t word = in[FIRST_WORD_IDX];
206
213k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
213k
        return word & mask;
208
213k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
213k
}
_ZN5doris11UnpackValueILi2ELi19ELb0EEEmPKh
Line
Count
Source
175
213k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
213k
    if (BIT_WIDTH == 0) return 0;
177
178
213k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
213k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
213k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
213k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
213k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
213k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
213k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
213k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
213k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
213k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
213k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
213k
    constexpr bool READ_32_BITS =
202
213k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
213k
    if (READ_32_BITS) {
205
213k
        uint32_t word = in[FIRST_WORD_IDX];
206
213k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
213k
        return word & mask;
208
213k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
213k
}
_ZN5doris11UnpackValueILi2ELi18ELb0EEEmPKh
Line
Count
Source
175
213k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
213k
    if (BIT_WIDTH == 0) return 0;
177
178
213k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
213k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
213k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
213k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
213k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
213k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
213k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
213k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
213k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
213k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
213k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
213k
    constexpr bool READ_32_BITS =
202
213k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
213k
    if (READ_32_BITS) {
205
213k
        uint32_t word = in[FIRST_WORD_IDX];
206
213k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
213k
        return word & mask;
208
213k
    }
209
210
4
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
4
    word >>= FIRST_BIT_OFFSET;
212
213
4
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
4
    return word & mask;
220
213k
}
_ZN5doris11UnpackValueILi2ELi17ELb0EEEmPKh
Line
Count
Source
175
213k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
213k
    if (BIT_WIDTH == 0) return 0;
177
178
213k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
213k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
213k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
213k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
213k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
213k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
213k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
213k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
213k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
213k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
213k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
213k
    constexpr bool READ_32_BITS =
202
213k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
213k
    if (READ_32_BITS) {
205
213k
        uint32_t word = in[FIRST_WORD_IDX];
206
213k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
213k
        return word & mask;
208
213k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
213k
}
_ZN5doris11UnpackValueILi2ELi16ELb0EEEmPKh
Line
Count
Source
175
213k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
213k
    if (BIT_WIDTH == 0) return 0;
177
178
213k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
213k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
213k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
213k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
213k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
213k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
213k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
213k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
213k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
213k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
213k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
213k
    constexpr bool READ_32_BITS =
202
213k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
213k
    if (READ_32_BITS) {
205
213k
        uint32_t word = in[FIRST_WORD_IDX];
206
213k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
213k
        return word & mask;
208
213k
    }
209
210
6
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
6
    word >>= FIRST_BIT_OFFSET;
212
213
6
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
6
    return word & mask;
220
213k
}
_ZN5doris11UnpackValueILi2ELi15ELb0EEEmPKh
Line
Count
Source
175
398k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
398k
    if (BIT_WIDTH == 0) return 0;
177
178
398k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
398k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
398k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
398k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
398k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
398k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
398k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
398k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
398k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
398k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
398k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
398k
    constexpr bool READ_32_BITS =
202
398k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
398k
    if (READ_32_BITS) {
205
398k
        uint32_t word = in[FIRST_WORD_IDX];
206
398k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
398k
        return word & mask;
208
398k
    }
209
210
212
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
212
    word >>= FIRST_BIT_OFFSET;
212
213
212
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
212
    return word & mask;
220
398k
}
_ZN5doris11UnpackValueILi2ELi14ELb0EEEmPKh
Line
Count
Source
175
398k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
398k
    if (BIT_WIDTH == 0) return 0;
177
178
398k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
398k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
398k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
398k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
398k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
398k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
398k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
398k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
398k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
398k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
398k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
398k
    constexpr bool READ_32_BITS =
202
398k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
398k
    if (READ_32_BITS) {
205
398k
        uint32_t word = in[FIRST_WORD_IDX];
206
398k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
398k
        return word & mask;
208
398k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
398k
}
_ZN5doris11UnpackValueILi2ELi13ELb0EEEmPKh
Line
Count
Source
175
398k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
398k
    if (BIT_WIDTH == 0) return 0;
177
178
398k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
398k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
398k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
398k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
398k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
398k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
398k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
398k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
398k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
398k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
398k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
398k
    constexpr bool READ_32_BITS =
202
398k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
398k
    if (READ_32_BITS) {
205
398k
        uint32_t word = in[FIRST_WORD_IDX];
206
398k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
398k
        return word & mask;
208
398k
    }
209
210
44
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
44
    word >>= FIRST_BIT_OFFSET;
212
213
44
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
44
    return word & mask;
220
398k
}
_ZN5doris11UnpackValueILi2ELi12ELb0EEEmPKh
Line
Count
Source
175
398k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
398k
    if (BIT_WIDTH == 0) return 0;
177
178
398k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
398k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
398k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
398k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
398k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
398k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
398k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
398k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
398k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
398k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
398k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
398k
    constexpr bool READ_32_BITS =
202
398k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
398k
    if (READ_32_BITS) {
205
398k
        uint32_t word = in[FIRST_WORD_IDX];
206
398k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
398k
        return word & mask;
208
398k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
398k
}
_ZN5doris11UnpackValueILi2ELi11ELb0EEEmPKh
Line
Count
Source
175
398k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
398k
    if (BIT_WIDTH == 0) return 0;
177
178
398k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
398k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
398k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
398k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
398k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
398k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
398k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
398k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
398k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
398k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
398k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
398k
    constexpr bool READ_32_BITS =
202
398k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
398k
    if (READ_32_BITS) {
205
398k
        uint32_t word = in[FIRST_WORD_IDX];
206
398k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
398k
        return word & mask;
208
398k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
398k
}
_ZN5doris11UnpackValueILi2ELi10ELb0EEEmPKh
Line
Count
Source
175
398k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
398k
    if (BIT_WIDTH == 0) return 0;
177
178
398k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
398k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
398k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
398k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
398k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
398k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
398k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
398k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
398k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
398k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
398k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
398k
    constexpr bool READ_32_BITS =
202
398k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
398k
    if (READ_32_BITS) {
205
398k
        uint32_t word = in[FIRST_WORD_IDX];
206
398k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
398k
        return word & mask;
208
398k
    }
209
210
12
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
12
    word >>= FIRST_BIT_OFFSET;
212
213
12
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
12
    return word & mask;
220
398k
}
_ZN5doris11UnpackValueILi2ELi9ELb0EEEmPKh
Line
Count
Source
175
398k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
398k
    if (BIT_WIDTH == 0) return 0;
177
178
398k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
398k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
398k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
398k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
398k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
398k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
398k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
398k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
398k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
398k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
398k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
398k
    constexpr bool READ_32_BITS =
202
398k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
398k
    if (READ_32_BITS) {
205
398k
        uint32_t word = in[FIRST_WORD_IDX];
206
398k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
398k
        return word & mask;
208
398k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
398k
}
_ZN5doris11UnpackValueILi2ELi8ELb0EEEmPKh
Line
Count
Source
175
398k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
398k
    if (BIT_WIDTH == 0) return 0;
177
178
398k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
398k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
398k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
398k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
398k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
398k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
398k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
398k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
398k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
398k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
398k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
398k
    constexpr bool READ_32_BITS =
202
398k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
398k
    if (READ_32_BITS) {
205
398k
        uint32_t word = in[FIRST_WORD_IDX];
206
398k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
398k
        return word & mask;
208
398k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
398k
}
_ZN5doris11UnpackValueILi2ELi7ELb0EEEmPKh
Line
Count
Source
175
647k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
647k
    if (BIT_WIDTH == 0) return 0;
177
178
647k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
647k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
647k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
647k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
647k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
647k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
647k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
647k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
647k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
647k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
647k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
647k
    constexpr bool READ_32_BITS =
202
647k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
647k
    if (READ_32_BITS) {
205
647k
        uint32_t word = in[FIRST_WORD_IDX];
206
647k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
647k
        return word & mask;
208
647k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
647k
}
_ZN5doris11UnpackValueILi2ELi6ELb0EEEmPKh
Line
Count
Source
175
647k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
647k
    if (BIT_WIDTH == 0) return 0;
177
178
647k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
647k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
647k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
647k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
647k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
647k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
647k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
647k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
647k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
647k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
647k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
647k
    constexpr bool READ_32_BITS =
202
647k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
647k
    if (READ_32_BITS) {
205
647k
        uint32_t word = in[FIRST_WORD_IDX];
206
647k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
647k
        return word & mask;
208
647k
    }
209
210
34
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
34
    word >>= FIRST_BIT_OFFSET;
212
213
34
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
34
    return word & mask;
220
647k
}
_ZN5doris11UnpackValueILi2ELi5ELb0EEEmPKh
Line
Count
Source
175
647k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
647k
    if (BIT_WIDTH == 0) return 0;
177
178
647k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
647k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
647k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
647k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
647k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
647k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
647k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
647k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
647k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
647k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
647k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
647k
    constexpr bool READ_32_BITS =
202
647k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
647k
    if (READ_32_BITS) {
205
647k
        uint32_t word = in[FIRST_WORD_IDX];
206
647k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
647k
        return word & mask;
208
647k
    }
209
210
18
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18
    word >>= FIRST_BIT_OFFSET;
212
213
18
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18
    return word & mask;
220
647k
}
_ZN5doris11UnpackValueILi2ELi4ELb0EEEmPKh
Line
Count
Source
175
647k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
647k
    if (BIT_WIDTH == 0) return 0;
177
178
647k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
647k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
647k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
647k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
647k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
647k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
647k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
647k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
647k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
647k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
647k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
647k
    constexpr bool READ_32_BITS =
202
647k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
647k
    if (READ_32_BITS) {
205
647k
        uint32_t word = in[FIRST_WORD_IDX];
206
647k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
647k
        return word & mask;
208
647k
    }
209
210
32
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
32
    word >>= FIRST_BIT_OFFSET;
212
213
32
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
32
    return word & mask;
220
647k
}
_ZN5doris11UnpackValueILi2ELi3ELb0EEEmPKh
Line
Count
Source
175
647k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
647k
    if (BIT_WIDTH == 0) return 0;
177
178
647k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
647k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
647k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
647k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
647k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
647k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
647k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
647k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
647k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
647k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
647k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
647k
    constexpr bool READ_32_BITS =
202
647k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
647k
    if (READ_32_BITS) {
205
647k
        uint32_t word = in[FIRST_WORD_IDX];
206
647k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
647k
        return word & mask;
208
647k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
647k
}
_ZN5doris11UnpackValueILi2ELi2ELb0EEEmPKh
Line
Count
Source
175
647k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
647k
    if (BIT_WIDTH == 0) return 0;
177
178
647k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
647k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
647k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
647k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
647k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
647k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
647k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
647k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
647k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
647k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
647k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
647k
    constexpr bool READ_32_BITS =
202
647k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
647k
    if (READ_32_BITS) {
205
647k
        uint32_t word = in[FIRST_WORD_IDX];
206
647k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
647k
        return word & mask;
208
647k
    }
209
210
22
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
22
    word >>= FIRST_BIT_OFFSET;
212
213
22
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
22
    return word & mask;
220
647k
}
_ZN5doris11UnpackValueILi2ELi1ELb0EEEmPKh
Line
Count
Source
175
647k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
647k
    if (BIT_WIDTH == 0) return 0;
177
178
647k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
647k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
647k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
647k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
647k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
647k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
647k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
647k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
647k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
647k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
647k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
647k
    constexpr bool READ_32_BITS =
202
647k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
647k
    if (READ_32_BITS) {
205
647k
        uint32_t word = in[FIRST_WORD_IDX];
206
647k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
647k
        return word & mask;
208
647k
    }
209
210
28
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
28
    word >>= FIRST_BIT_OFFSET;
212
213
28
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
28
    return word & mask;
220
647k
}
_ZN5doris11UnpackValueILi2ELi0ELb0EEEmPKh
Line
Count
Source
175
647k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
647k
    if (BIT_WIDTH == 0) return 0;
177
178
647k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
647k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
647k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
647k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
647k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
647k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
647k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
647k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
647k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
647k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
647k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
647k
    constexpr bool READ_32_BITS =
202
647k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
647k
    if (READ_32_BITS) {
205
647k
        uint32_t word = in[FIRST_WORD_IDX];
206
647k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
647k
        return word & mask;
208
647k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
647k
}
_ZN5doris11UnpackValueILi3ELi0ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.43M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.43M
    word >>= FIRST_BIT_OFFSET;
212
213
1.43M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.43M
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi1ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.43M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.43M
    word >>= FIRST_BIT_OFFSET;
212
213
1.43M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.43M
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi2ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.43M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.43M
    word >>= FIRST_BIT_OFFSET;
212
213
1.43M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.43M
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi3ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.43M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.43M
    word >>= FIRST_BIT_OFFSET;
212
213
1.43M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.43M
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi4ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.43M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.43M
    word >>= FIRST_BIT_OFFSET;
212
213
1.43M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.43M
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi5ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.43M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.43M
    word >>= FIRST_BIT_OFFSET;
212
213
1.43M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.43M
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi6ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.43M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.43M
    word >>= FIRST_BIT_OFFSET;
212
213
1.43M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.43M
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi7ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.43M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.43M
    word >>= FIRST_BIT_OFFSET;
212
213
1.43M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.43M
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi8ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.43M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.43M
    word >>= FIRST_BIT_OFFSET;
212
213
1.43M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.43M
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi9ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.43M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.43M
    word >>= FIRST_BIT_OFFSET;
212
213
1.43M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.43M
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi10ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.43M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.43M
    word >>= FIRST_BIT_OFFSET;
212
213
1.43M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.43M
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi11ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.43M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.43M
    word >>= FIRST_BIT_OFFSET;
212
213
1.43M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.43M
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi12ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.43M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.43M
    word >>= FIRST_BIT_OFFSET;
212
213
1.43M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.43M
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi13ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.43M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.43M
    word >>= FIRST_BIT_OFFSET;
212
213
1.43M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.43M
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi14ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.43M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.43M
    word >>= FIRST_BIT_OFFSET;
212
213
1.43M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.43M
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi15ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.43M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.43M
    word >>= FIRST_BIT_OFFSET;
212
213
1.43M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.43M
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi16ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.43M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.43M
    word >>= FIRST_BIT_OFFSET;
212
213
1.43M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.43M
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi17ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.43M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.43M
    word >>= FIRST_BIT_OFFSET;
212
213
1.43M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.43M
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi18ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.43M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.43M
    word >>= FIRST_BIT_OFFSET;
212
213
1.43M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.43M
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi19ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.43M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.43M
    word >>= FIRST_BIT_OFFSET;
212
213
1.43M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.43M
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi20ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.43M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.43M
    word >>= FIRST_BIT_OFFSET;
212
213
1.43M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.43M
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi21ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.43M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.43M
    word >>= FIRST_BIT_OFFSET;
212
213
1.43M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.43M
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi22ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
1.43M
        uint32_t word = in[FIRST_WORD_IDX];
206
18.4E
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.43M
        return word & mask;
208
1.43M
    }
209
210
24
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
24
    word >>= FIRST_BIT_OFFSET;
212
213
24
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
24
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi23ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
1.43M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.43M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.43M
        return word & mask;
208
1.43M
    }
209
210
148
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
148
    word >>= FIRST_BIT_OFFSET;
212
213
148
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
148
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi24ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
1.43M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.43M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.43M
        return word & mask;
208
1.43M
    }
209
210
1.13k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.13k
    word >>= FIRST_BIT_OFFSET;
212
213
1.13k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.13k
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi25ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
1.43M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.43M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.43M
        return word & mask;
208
1.43M
    }
209
210
62
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
62
    word >>= FIRST_BIT_OFFSET;
212
213
62
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
62
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi26ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
1.43M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.43M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.43M
        return word & mask;
208
1.43M
    }
209
210
138
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
138
    word >>= FIRST_BIT_OFFSET;
212
213
138
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
138
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi27ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
1.43M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.43M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.43M
        return word & mask;
208
1.43M
    }
209
210
1.89k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.89k
    word >>= FIRST_BIT_OFFSET;
212
213
1.89k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.89k
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi28ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
1.43M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.43M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.43M
        return word & mask;
208
1.43M
    }
209
210
78
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
78
    word >>= FIRST_BIT_OFFSET;
212
213
78
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
78
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi29ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
1.43M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.43M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.43M
        return word & mask;
208
1.43M
    }
209
210
108
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
108
    word >>= FIRST_BIT_OFFSET;
212
213
108
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
108
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi30ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
1.43M
        uint32_t word = in[FIRST_WORD_IDX];
206
18.4E
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.43M
        return word & mask;
208
1.43M
    }
209
210
1.68k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.68k
    word >>= FIRST_BIT_OFFSET;
212
213
1.68k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.68k
    return word & mask;
220
1.43M
}
_ZN5doris11UnpackValueILi3ELi31ELb1EEEmPKh
Line
Count
Source
175
1.43M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.43M
    if (BIT_WIDTH == 0) return 0;
177
178
1.43M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.43M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.43M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.43M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.43M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.43M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.43M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.43M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.43M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.43M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.43M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.43M
    constexpr bool READ_32_BITS =
202
1.43M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.43M
    if (READ_32_BITS) {
205
1.43M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.43M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.43M
        return word & mask;
208
1.43M
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
1.43M
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi3ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi3ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi3ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi3ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi3ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi3ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi3ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi3ELi23ELb0EEEmPKh
Line
Count
Source
175
96.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
96.9k
    if (BIT_WIDTH == 0) return 0;
177
178
96.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
96.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
96.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
96.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
96.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
96.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
96.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
96.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
96.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
96.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
96.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
96.9k
    constexpr bool READ_32_BITS =
202
96.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
96.9k
    if (READ_32_BITS) {
205
96.9k
        uint32_t word = in[FIRST_WORD_IDX];
206
96.9k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
96.9k
        return word & mask;
208
96.9k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
96.9k
}
_ZN5doris11UnpackValueILi3ELi22ELb0EEEmPKh
Line
Count
Source
175
96.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
96.9k
    if (BIT_WIDTH == 0) return 0;
177
178
96.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
96.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
96.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
96.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
96.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
96.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
96.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
96.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
96.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
96.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
96.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
96.9k
    constexpr bool READ_32_BITS =
202
96.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
96.9k
    if (READ_32_BITS) {
205
96.9k
        uint32_t word = in[FIRST_WORD_IDX];
206
96.9k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
96.9k
        return word & mask;
208
96.9k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
96.9k
}
_ZN5doris11UnpackValueILi3ELi21ELb0EEEmPKh
Line
Count
Source
175
96.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
96.9k
    if (BIT_WIDTH == 0) return 0;
177
178
96.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
96.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
96.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
96.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
96.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
96.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
96.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
96.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
96.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
96.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
96.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
96.9k
    constexpr bool READ_32_BITS =
202
96.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
96.9k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
96.9k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
96.9k
    word >>= FIRST_BIT_OFFSET;
212
213
96.9k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
96.9k
    return word & mask;
220
96.9k
}
_ZN5doris11UnpackValueILi3ELi20ELb0EEEmPKh
Line
Count
Source
175
96.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
96.9k
    if (BIT_WIDTH == 0) return 0;
177
178
96.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
96.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
96.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
96.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
96.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
96.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
96.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
96.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
96.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
96.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
96.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
96.9k
    constexpr bool READ_32_BITS =
202
96.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
96.9k
    if (READ_32_BITS) {
205
96.9k
        uint32_t word = in[FIRST_WORD_IDX];
206
96.9k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
96.9k
        return word & mask;
208
96.9k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
96.9k
}
_ZN5doris11UnpackValueILi3ELi19ELb0EEEmPKh
Line
Count
Source
175
96.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
96.9k
    if (BIT_WIDTH == 0) return 0;
177
178
96.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
96.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
96.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
96.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
96.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
96.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
96.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
96.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
96.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
96.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
96.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
96.9k
    constexpr bool READ_32_BITS =
202
96.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
96.9k
    if (READ_32_BITS) {
205
96.9k
        uint32_t word = in[FIRST_WORD_IDX];
206
96.9k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
96.9k
        return word & mask;
208
96.9k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
96.9k
}
_ZN5doris11UnpackValueILi3ELi18ELb0EEEmPKh
Line
Count
Source
175
96.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
96.9k
    if (BIT_WIDTH == 0) return 0;
177
178
96.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
96.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
96.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
96.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
96.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
96.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
96.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
96.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
96.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
96.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
96.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
96.9k
    constexpr bool READ_32_BITS =
202
96.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
96.9k
    if (READ_32_BITS) {
205
96.9k
        uint32_t word = in[FIRST_WORD_IDX];
206
96.9k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
96.9k
        return word & mask;
208
96.9k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
96.9k
}
_ZN5doris11UnpackValueILi3ELi17ELb0EEEmPKh
Line
Count
Source
175
96.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
96.9k
    if (BIT_WIDTH == 0) return 0;
177
178
96.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
96.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
96.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
96.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
96.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
96.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
96.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
96.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
96.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
96.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
96.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
96.9k
    constexpr bool READ_32_BITS =
202
96.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
96.9k
    if (READ_32_BITS) {
205
96.8k
        uint32_t word = in[FIRST_WORD_IDX];
206
96.8k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
96.8k
        return word & mask;
208
96.8k
    }
209
210
48
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
48
    word >>= FIRST_BIT_OFFSET;
212
213
48
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
48
    return word & mask;
220
96.9k
}
_ZN5doris11UnpackValueILi3ELi16ELb0EEEmPKh
Line
Count
Source
175
96.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
96.9k
    if (BIT_WIDTH == 0) return 0;
177
178
96.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
96.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
96.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
96.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
96.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
96.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
96.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
96.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
96.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
96.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
96.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
96.9k
    constexpr bool READ_32_BITS =
202
96.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
96.9k
    if (READ_32_BITS) {
205
96.9k
        uint32_t word = in[FIRST_WORD_IDX];
206
96.9k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
96.9k
        return word & mask;
208
96.9k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
96.9k
}
_ZN5doris11UnpackValueILi3ELi15ELb0EEEmPKh
Line
Count
Source
175
100k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
100k
    if (BIT_WIDTH == 0) return 0;
177
178
100k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
100k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
100k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
100k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
100k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
100k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
100k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
100k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
100k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
100k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
100k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
100k
    constexpr bool READ_32_BITS =
202
100k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
100k
    if (READ_32_BITS) {
205
100k
        uint32_t word = in[FIRST_WORD_IDX];
206
100k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
100k
        return word & mask;
208
100k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
100k
}
_ZN5doris11UnpackValueILi3ELi14ELb0EEEmPKh
Line
Count
Source
175
100k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
100k
    if (BIT_WIDTH == 0) return 0;
177
178
100k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
100k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
100k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
100k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
100k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
100k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
100k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
100k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
100k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
100k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
100k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
100k
    constexpr bool READ_32_BITS =
202
100k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
100k
    if (READ_32_BITS) {
205
100k
        uint32_t word = in[FIRST_WORD_IDX];
206
100k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
100k
        return word & mask;
208
100k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
100k
}
_ZN5doris11UnpackValueILi3ELi13ELb0EEEmPKh
Line
Count
Source
175
100k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
100k
    if (BIT_WIDTH == 0) return 0;
177
178
100k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
100k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
100k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
100k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
100k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
100k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
100k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
100k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
100k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
100k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
100k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
100k
    constexpr bool READ_32_BITS =
202
100k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
100k
    if (READ_32_BITS) {
205
100k
        uint32_t word = in[FIRST_WORD_IDX];
206
100k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
100k
        return word & mask;
208
100k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
100k
}
_ZN5doris11UnpackValueILi3ELi12ELb0EEEmPKh
Line
Count
Source
175
100k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
100k
    if (BIT_WIDTH == 0) return 0;
177
178
100k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
100k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
100k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
100k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
100k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
100k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
100k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
100k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
100k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
100k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
100k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
100k
    constexpr bool READ_32_BITS =
202
100k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
100k
    if (READ_32_BITS) {
205
100k
        uint32_t word = in[FIRST_WORD_IDX];
206
100k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
100k
        return word & mask;
208
100k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
100k
}
_ZN5doris11UnpackValueILi3ELi11ELb0EEEmPKh
Line
Count
Source
175
100k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
100k
    if (BIT_WIDTH == 0) return 0;
177
178
100k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
100k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
100k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
100k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
100k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
100k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
100k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
100k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
100k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
100k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
100k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
100k
    constexpr bool READ_32_BITS =
202
100k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
100k
    if (READ_32_BITS) {
205
100k
        uint32_t word = in[FIRST_WORD_IDX];
206
100k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
100k
        return word & mask;
208
100k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
100k
}
_ZN5doris11UnpackValueILi3ELi10ELb0EEEmPKh
Line
Count
Source
175
100k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
100k
    if (BIT_WIDTH == 0) return 0;
177
178
100k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
100k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
100k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
100k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
100k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
100k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
100k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
100k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
100k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
100k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
100k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
100k
    constexpr bool READ_32_BITS =
202
100k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
100k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
100k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
100k
    word >>= FIRST_BIT_OFFSET;
212
213
100k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
100k
    return word & mask;
220
100k
}
_ZN5doris11UnpackValueILi3ELi9ELb0EEEmPKh
Line
Count
Source
175
100k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
100k
    if (BIT_WIDTH == 0) return 0;
177
178
100k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
100k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
100k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
100k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
100k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
100k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
100k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
100k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
100k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
100k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
100k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
100k
    constexpr bool READ_32_BITS =
202
100k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
100k
    if (READ_32_BITS) {
205
100k
        uint32_t word = in[FIRST_WORD_IDX];
206
100k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
100k
        return word & mask;
208
100k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
100k
}
_ZN5doris11UnpackValueILi3ELi8ELb0EEEmPKh
Line
Count
Source
175
100k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
100k
    if (BIT_WIDTH == 0) return 0;
177
178
100k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
100k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
100k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
100k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
100k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
100k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
100k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
100k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
100k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
100k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
100k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
100k
    constexpr bool READ_32_BITS =
202
100k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
100k
    if (READ_32_BITS) {
205
100k
        uint32_t word = in[FIRST_WORD_IDX];
206
100k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
100k
        return word & mask;
208
100k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
100k
}
_ZN5doris11UnpackValueILi3ELi7ELb0EEEmPKh
Line
Count
Source
175
113k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
113k
    if (BIT_WIDTH == 0) return 0;
177
178
113k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
113k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
113k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
113k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
113k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
113k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
113k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
113k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
113k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
113k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
113k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
113k
    constexpr bool READ_32_BITS =
202
113k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
113k
    if (READ_32_BITS) {
205
113k
        uint32_t word = in[FIRST_WORD_IDX];
206
113k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
113k
        return word & mask;
208
113k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
113k
}
_ZN5doris11UnpackValueILi3ELi6ELb0EEEmPKh
Line
Count
Source
175
113k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
113k
    if (BIT_WIDTH == 0) return 0;
177
178
113k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
113k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
113k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
113k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
113k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
113k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
113k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
113k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
113k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
113k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
113k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
113k
    constexpr bool READ_32_BITS =
202
113k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
113k
    if (READ_32_BITS) {
205
113k
        uint32_t word = in[FIRST_WORD_IDX];
206
113k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
113k
        return word & mask;
208
113k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
113k
}
_ZN5doris11UnpackValueILi3ELi5ELb0EEEmPKh
Line
Count
Source
175
113k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
113k
    if (BIT_WIDTH == 0) return 0;
177
178
113k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
113k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
113k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
113k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
113k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
113k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
113k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
113k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
113k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
113k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
113k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
113k
    constexpr bool READ_32_BITS =
202
113k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
113k
    if (READ_32_BITS) {
205
113k
        uint32_t word = in[FIRST_WORD_IDX];
206
113k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
113k
        return word & mask;
208
113k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
113k
}
_ZN5doris11UnpackValueILi3ELi4ELb0EEEmPKh
Line
Count
Source
175
113k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
113k
    if (BIT_WIDTH == 0) return 0;
177
178
113k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
113k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
113k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
113k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
113k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
113k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
113k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
113k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
113k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
113k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
113k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
113k
    constexpr bool READ_32_BITS =
202
113k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
113k
    if (READ_32_BITS) {
205
113k
        uint32_t word = in[FIRST_WORD_IDX];
206
113k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
113k
        return word & mask;
208
113k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
113k
}
_ZN5doris11UnpackValueILi3ELi3ELb0EEEmPKh
Line
Count
Source
175
113k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
113k
    if (BIT_WIDTH == 0) return 0;
177
178
113k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
113k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
113k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
113k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
113k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
113k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
113k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
113k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
113k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
113k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
113k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
113k
    constexpr bool READ_32_BITS =
202
113k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
113k
    if (READ_32_BITS) {
205
113k
        uint32_t word = in[FIRST_WORD_IDX];
206
113k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
113k
        return word & mask;
208
113k
    }
209
210
4
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
4
    word >>= FIRST_BIT_OFFSET;
212
213
4
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
4
    return word & mask;
220
113k
}
_ZN5doris11UnpackValueILi3ELi2ELb0EEEmPKh
Line
Count
Source
175
113k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
113k
    if (BIT_WIDTH == 0) return 0;
177
178
113k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
113k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
113k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
113k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
113k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
113k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
113k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
113k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
113k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
113k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
113k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
113k
    constexpr bool READ_32_BITS =
202
113k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
113k
    if (READ_32_BITS) {
205
113k
        uint32_t word = in[FIRST_WORD_IDX];
206
113k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
113k
        return word & mask;
208
113k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
113k
}
_ZN5doris11UnpackValueILi3ELi1ELb0EEEmPKh
Line
Count
Source
175
113k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
113k
    if (BIT_WIDTH == 0) return 0;
177
178
113k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
113k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
113k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
113k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
113k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
113k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
113k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
113k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
113k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
113k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
113k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
113k
    constexpr bool READ_32_BITS =
202
113k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
113k
    if (READ_32_BITS) {
205
113k
        uint32_t word = in[FIRST_WORD_IDX];
206
113k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
113k
        return word & mask;
208
113k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
113k
}
_ZN5doris11UnpackValueILi3ELi0ELb0EEEmPKh
Line
Count
Source
175
113k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
113k
    if (BIT_WIDTH == 0) return 0;
177
178
113k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
113k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
113k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
113k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
113k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
113k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
113k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
113k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
113k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
113k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
113k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
113k
    constexpr bool READ_32_BITS =
202
113k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
113k
    if (READ_32_BITS) {
205
113k
        uint32_t word = in[FIRST_WORD_IDX];
206
18.4E
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
113k
        return word & mask;
208
113k
    }
209
210
78
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
78
    word >>= FIRST_BIT_OFFSET;
212
213
78
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
78
    return word & mask;
220
113k
}
_ZN5doris11UnpackValueILi4ELi0ELb1EEEmPKh
Line
Count
Source
175
4.99M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.99M
    if (BIT_WIDTH == 0) return 0;
177
178
4.99M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.99M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.99M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.99M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.99M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.99M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.99M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.99M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.99M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.99M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.99M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.99M
    constexpr bool READ_32_BITS =
202
4.99M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.99M
    if (READ_32_BITS) {
205
4.99M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.99M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.99M
        return word & mask;
208
4.99M
    }
209
210
672
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
672
    word >>= FIRST_BIT_OFFSET;
212
213
672
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
672
    return word & mask;
220
4.99M
}
_ZN5doris11UnpackValueILi4ELi1ELb1EEEmPKh
Line
Count
Source
175
4.99M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.99M
    if (BIT_WIDTH == 0) return 0;
177
178
4.99M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.99M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.99M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.99M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.99M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.99M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.99M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.99M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.99M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.99M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.99M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.99M
    constexpr bool READ_32_BITS =
202
4.99M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.99M
    if (READ_32_BITS) {
205
4.99M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.99M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.99M
        return word & mask;
208
4.99M
    }
209
210
1.25k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.25k
    word >>= FIRST_BIT_OFFSET;
212
213
1.25k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.25k
    return word & mask;
220
4.99M
}
_ZN5doris11UnpackValueILi4ELi2ELb1EEEmPKh
Line
Count
Source
175
4.99M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.99M
    if (BIT_WIDTH == 0) return 0;
177
178
4.99M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.99M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.99M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.99M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.99M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.99M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.99M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.99M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.99M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.99M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.99M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.99M
    constexpr bool READ_32_BITS =
202
4.99M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.99M
    if (READ_32_BITS) {
205
4.98M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.98M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.98M
        return word & mask;
208
4.98M
    }
209
210
1.95k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.95k
    word >>= FIRST_BIT_OFFSET;
212
213
1.95k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.95k
    return word & mask;
220
4.99M
}
_ZN5doris11UnpackValueILi4ELi3ELb1EEEmPKh
Line
Count
Source
175
4.99M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.99M
    if (BIT_WIDTH == 0) return 0;
177
178
4.99M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.99M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.99M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.99M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.99M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.99M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.99M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.99M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.99M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.99M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.99M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.99M
    constexpr bool READ_32_BITS =
202
4.99M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.99M
    if (READ_32_BITS) {
205
4.99M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.99M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.99M
        return word & mask;
208
4.99M
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
4.99M
}
_ZN5doris11UnpackValueILi4ELi4ELb1EEEmPKh
Line
Count
Source
175
4.99M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.99M
    if (BIT_WIDTH == 0) return 0;
177
178
4.99M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.99M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.99M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.99M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.99M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.99M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.99M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.99M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.99M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.99M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.99M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.99M
    constexpr bool READ_32_BITS =
202
4.99M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.99M
    if (READ_32_BITS) {
205
4.99M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.99M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.99M
        return word & mask;
208
4.99M
    }
209
210
968
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
968
    word >>= FIRST_BIT_OFFSET;
212
213
968
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
968
    return word & mask;
220
4.99M
}
_ZN5doris11UnpackValueILi4ELi5ELb1EEEmPKh
Line
Count
Source
175
4.99M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.99M
    if (BIT_WIDTH == 0) return 0;
177
178
4.99M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.99M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.99M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.99M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.99M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.99M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.99M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.99M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.99M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.99M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.99M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.99M
    constexpr bool READ_32_BITS =
202
4.99M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.99M
    if (READ_32_BITS) {
205
4.99M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.99M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.99M
        return word & mask;
208
4.99M
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
4.99M
}
_ZN5doris11UnpackValueILi4ELi6ELb1EEEmPKh
Line
Count
Source
175
4.99M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.99M
    if (BIT_WIDTH == 0) return 0;
177
178
4.99M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.99M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.99M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.99M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.99M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.99M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.99M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.99M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.99M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.99M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.99M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.99M
    constexpr bool READ_32_BITS =
202
4.99M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.99M
    if (READ_32_BITS) {
205
4.99M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.99M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.99M
        return word & mask;
208
4.99M
    }
209
210
334
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
334
    word >>= FIRST_BIT_OFFSET;
212
213
334
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
334
    return word & mask;
220
4.99M
}
_ZN5doris11UnpackValueILi4ELi7ELb1EEEmPKh
Line
Count
Source
175
4.99M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.99M
    if (BIT_WIDTH == 0) return 0;
177
178
4.99M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.99M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.99M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.99M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.99M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.99M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.99M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.99M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.99M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.99M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.99M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.99M
    constexpr bool READ_32_BITS =
202
4.99M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.99M
    if (READ_32_BITS) {
205
4.99M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.99M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.99M
        return word & mask;
208
4.99M
    }
209
210
804
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
804
    word >>= FIRST_BIT_OFFSET;
212
213
804
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
804
    return word & mask;
220
4.99M
}
_ZN5doris11UnpackValueILi4ELi8ELb1EEEmPKh
Line
Count
Source
175
4.99M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.99M
    if (BIT_WIDTH == 0) return 0;
177
178
4.99M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.99M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.99M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.99M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.99M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.99M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.99M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.99M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.99M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.99M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.99M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.99M
    constexpr bool READ_32_BITS =
202
4.99M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.99M
    if (READ_32_BITS) {
205
4.99M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.99M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.99M
        return word & mask;
208
4.99M
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
4.99M
}
_ZN5doris11UnpackValueILi4ELi9ELb1EEEmPKh
Line
Count
Source
175
4.99M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.99M
    if (BIT_WIDTH == 0) return 0;
177
178
4.99M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.99M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.99M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.99M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.99M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.99M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.99M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.99M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.99M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.99M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.99M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.99M
    constexpr bool READ_32_BITS =
202
4.99M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.99M
    if (READ_32_BITS) {
205
4.99M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.99M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.99M
        return word & mask;
208
4.99M
    }
209
210
3.86k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.86k
    word >>= FIRST_BIT_OFFSET;
212
213
3.86k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.86k
    return word & mask;
220
4.99M
}
_ZN5doris11UnpackValueILi4ELi10ELb1EEEmPKh
Line
Count
Source
175
4.99M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.99M
    if (BIT_WIDTH == 0) return 0;
177
178
4.99M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.99M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.99M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.99M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.99M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.99M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.99M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.99M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.99M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.99M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.99M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.99M
    constexpr bool READ_32_BITS =
202
4.99M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.99M
    if (READ_32_BITS) {
205
4.98M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.98M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.98M
        return word & mask;
208
4.98M
    }
209
210
4.54k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
4.54k
    word >>= FIRST_BIT_OFFSET;
212
213
4.54k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
4.54k
    return word & mask;
220
4.99M
}
_ZN5doris11UnpackValueILi4ELi11ELb1EEEmPKh
Line
Count
Source
175
4.99M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.99M
    if (BIT_WIDTH == 0) return 0;
177
178
4.99M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.99M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.99M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.99M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.99M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.99M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.99M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.99M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.99M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.99M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.99M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.99M
    constexpr bool READ_32_BITS =
202
4.99M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.99M
    if (READ_32_BITS) {
205
4.98M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.98M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.98M
        return word & mask;
208
4.98M
    }
209
210
6.70k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
6.70k
    word >>= FIRST_BIT_OFFSET;
212
213
6.70k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
6.70k
    return word & mask;
220
4.99M
}
_ZN5doris11UnpackValueILi4ELi12ELb1EEEmPKh
Line
Count
Source
175
4.99M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.99M
    if (BIT_WIDTH == 0) return 0;
177
178
4.99M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.99M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.99M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.99M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.99M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.99M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.99M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.99M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.99M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.99M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.99M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.99M
    constexpr bool READ_32_BITS =
202
4.99M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.99M
    if (READ_32_BITS) {
205
4.98M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.98M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.98M
        return word & mask;
208
4.98M
    }
209
210
4.17k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
4.17k
    word >>= FIRST_BIT_OFFSET;
212
213
4.17k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
4.17k
    return word & mask;
220
4.99M
}
_ZN5doris11UnpackValueILi4ELi13ELb1EEEmPKh
Line
Count
Source
175
4.99M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.99M
    if (BIT_WIDTH == 0) return 0;
177
178
4.99M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.99M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.99M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.99M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.99M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.99M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.99M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.99M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.99M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.99M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.99M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.99M
    constexpr bool READ_32_BITS =
202
4.99M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.99M
    if (READ_32_BITS) {
205
4.98M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.98M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.98M
        return word & mask;
208
4.98M
    }
209
210
4.00k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
4.00k
    word >>= FIRST_BIT_OFFSET;
212
213
4.00k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
4.00k
    return word & mask;
220
4.99M
}
_ZN5doris11UnpackValueILi4ELi14ELb1EEEmPKh
Line
Count
Source
175
4.99M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.99M
    if (BIT_WIDTH == 0) return 0;
177
178
4.99M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.99M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.99M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.99M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.99M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.99M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.99M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.99M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.99M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.99M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.99M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.99M
    constexpr bool READ_32_BITS =
202
4.99M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.99M
    if (READ_32_BITS) {
205
4.98M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.98M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.98M
        return word & mask;
208
4.98M
    }
209
210
3.15k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.15k
    word >>= FIRST_BIT_OFFSET;
212
213
3.15k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.15k
    return word & mask;
220
4.99M
}
_ZN5doris11UnpackValueILi4ELi15ELb1EEEmPKh
Line
Count
Source
175
4.99M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.99M
    if (BIT_WIDTH == 0) return 0;
177
178
4.99M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.99M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.99M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.99M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.99M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.99M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.99M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.99M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.99M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.99M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.99M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.99M
    constexpr bool READ_32_BITS =
202
4.99M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.99M
    if (READ_32_BITS) {
205
4.98M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.98M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.98M
        return word & mask;
208
4.98M
    }
209
210
3.11k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.11k
    word >>= FIRST_BIT_OFFSET;
212
213
3.11k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.11k
    return word & mask;
220
4.99M
}
_ZN5doris11UnpackValueILi4ELi16ELb1EEEmPKh
Line
Count
Source
175
4.99M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.99M
    if (BIT_WIDTH == 0) return 0;
177
178
4.99M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.99M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.99M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.99M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.99M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.99M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.99M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.99M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.99M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.99M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.99M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.99M
    constexpr bool READ_32_BITS =
202
4.99M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.99M
    if (READ_32_BITS) {
205
4.99M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.99M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.99M
        return word & mask;
208
4.99M
    }
209
210
804
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
804
    word >>= FIRST_BIT_OFFSET;
212
213
804
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
804
    return word & mask;
220
4.99M
}
_ZN5doris11UnpackValueILi4ELi17ELb1EEEmPKh
Line
Count
Source
175
4.99M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.99M
    if (BIT_WIDTH == 0) return 0;
177
178
4.99M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.99M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.99M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.99M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.99M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.99M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.99M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.99M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.99M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.99M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.99M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.99M
    constexpr bool READ_32_BITS =
202
4.99M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.99M
    if (READ_32_BITS) {
205
4.99M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.99M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.99M
        return word & mask;
208
4.99M
    }
209
210
1.24k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.24k
    word >>= FIRST_BIT_OFFSET;
212
213
1.24k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.24k
    return word & mask;
220
4.99M
}
_ZN5doris11UnpackValueILi4ELi18ELb1EEEmPKh
Line
Count
Source
175
4.99M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.99M
    if (BIT_WIDTH == 0) return 0;
177
178
4.99M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.99M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.99M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.99M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.99M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.99M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.99M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.99M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.99M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.99M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.99M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.99M
    constexpr bool READ_32_BITS =
202
4.99M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.99M
    if (READ_32_BITS) {
205
4.98M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.98M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.98M
        return word & mask;
208
4.98M
    }
209
210
3.58k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.58k
    word >>= FIRST_BIT_OFFSET;
212
213
3.58k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.58k
    return word & mask;
220
4.99M
}
_ZN5doris11UnpackValueILi4ELi19ELb1EEEmPKh
Line
Count
Source
175
4.99M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.99M
    if (BIT_WIDTH == 0) return 0;
177
178
4.99M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.99M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.99M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.99M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.99M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.99M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.99M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.99M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.99M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.99M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.99M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.99M
    constexpr bool READ_32_BITS =
202
4.99M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.99M
    if (READ_32_BITS) {
205
4.98M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.98M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.98M
        return word & mask;
208
4.98M
    }
209
210
4.27k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
4.27k
    word >>= FIRST_BIT_OFFSET;
212
213
4.27k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
4.27k
    return word & mask;
220
4.99M
}
_ZN5doris11UnpackValueILi4ELi20ELb1EEEmPKh
Line
Count
Source
175
4.99M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.99M
    if (BIT_WIDTH == 0) return 0;
177
178
4.99M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.99M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.99M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.99M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.99M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.99M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.99M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.99M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.99M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.99M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.99M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.99M
    constexpr bool READ_32_BITS =
202
4.99M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.99M
    if (READ_32_BITS) {
205
4.98M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.98M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.98M
        return word & mask;
208
4.98M
    }
209
210
4.40k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
4.40k
    word >>= FIRST_BIT_OFFSET;
212
213
4.40k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
4.40k
    return word & mask;
220
4.99M
}
_ZN5doris11UnpackValueILi4ELi21ELb1EEEmPKh
Line
Count
Source
175
4.99M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.99M
    if (BIT_WIDTH == 0) return 0;
177
178
4.99M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.99M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.99M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.99M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.99M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.99M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.99M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.99M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.99M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.99M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.99M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.99M
    constexpr bool READ_32_BITS =
202
4.99M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.99M
    if (READ_32_BITS) {
205
4.98M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.98M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.98M
        return word & mask;
208
4.98M
    }
209
210
2.95k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.95k
    word >>= FIRST_BIT_OFFSET;
212
213
2.95k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.95k
    return word & mask;
220
4.99M
}
_ZN5doris11UnpackValueILi4ELi22ELb1EEEmPKh
Line
Count
Source
175
4.98M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.98M
    if (BIT_WIDTH == 0) return 0;
177
178
4.98M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.98M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.98M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.98M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.98M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.98M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.98M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.98M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.98M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.98M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.98M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.98M
    constexpr bool READ_32_BITS =
202
4.98M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.98M
    if (READ_32_BITS) {
205
4.98M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.98M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.98M
        return word & mask;
208
4.98M
    }
209
210
2.96k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.96k
    word >>= FIRST_BIT_OFFSET;
212
213
2.96k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.96k
    return word & mask;
220
4.98M
}
_ZN5doris11UnpackValueILi4ELi23ELb1EEEmPKh
Line
Count
Source
175
4.98M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.98M
    if (BIT_WIDTH == 0) return 0;
177
178
4.98M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.98M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.98M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.98M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.98M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.98M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.98M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.98M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.98M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.98M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.98M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.98M
    constexpr bool READ_32_BITS =
202
4.98M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.98M
    if (READ_32_BITS) {
205
4.98M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.98M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.98M
        return word & mask;
208
4.98M
    }
209
210
1.35k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.35k
    word >>= FIRST_BIT_OFFSET;
212
213
1.35k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.35k
    return word & mask;
220
4.98M
}
_ZN5doris11UnpackValueILi4ELi24ELb1EEEmPKh
Line
Count
Source
175
4.98M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.98M
    if (BIT_WIDTH == 0) return 0;
177
178
4.98M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.98M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.98M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.98M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.98M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.98M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.98M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.98M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.98M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.98M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.98M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.98M
    constexpr bool READ_32_BITS =
202
4.98M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.98M
    if (READ_32_BITS) {
205
4.98M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.98M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.98M
        return word & mask;
208
4.98M
    }
209
210
432
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
432
    word >>= FIRST_BIT_OFFSET;
212
213
432
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
432
    return word & mask;
220
4.98M
}
_ZN5doris11UnpackValueILi4ELi25ELb1EEEmPKh
Line
Count
Source
175
4.98M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.98M
    if (BIT_WIDTH == 0) return 0;
177
178
4.98M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.98M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.98M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.98M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.98M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.98M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.98M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.98M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.98M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.98M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.98M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.98M
    constexpr bool READ_32_BITS =
202
4.98M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.98M
    if (READ_32_BITS) {
205
4.98M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.98M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.98M
        return word & mask;
208
4.98M
    }
209
210
1.29k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.29k
    word >>= FIRST_BIT_OFFSET;
212
213
1.29k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.29k
    return word & mask;
220
4.98M
}
_ZN5doris11UnpackValueILi4ELi26ELb1EEEmPKh
Line
Count
Source
175
4.98M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.98M
    if (BIT_WIDTH == 0) return 0;
177
178
4.98M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.98M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.98M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.98M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.98M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.98M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.98M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.98M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.98M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.98M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.98M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.98M
    constexpr bool READ_32_BITS =
202
4.98M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.98M
    if (READ_32_BITS) {
205
4.98M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.98M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.98M
        return word & mask;
208
4.98M
    }
209
210
1.24k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.24k
    word >>= FIRST_BIT_OFFSET;
212
213
1.24k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.24k
    return word & mask;
220
4.98M
}
_ZN5doris11UnpackValueILi4ELi27ELb1EEEmPKh
Line
Count
Source
175
4.98M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.98M
    if (BIT_WIDTH == 0) return 0;
177
178
4.98M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.98M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.98M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.98M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.98M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.98M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.98M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.98M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.98M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.98M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.98M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.98M
    constexpr bool READ_32_BITS =
202
4.98M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.98M
    if (READ_32_BITS) {
205
4.98M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.98M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.98M
        return word & mask;
208
4.98M
    }
209
210
134
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
134
    word >>= FIRST_BIT_OFFSET;
212
213
134
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
134
    return word & mask;
220
4.98M
}
_ZN5doris11UnpackValueILi4ELi28ELb1EEEmPKh
Line
Count
Source
175
4.98M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.98M
    if (BIT_WIDTH == 0) return 0;
177
178
4.98M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.98M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.98M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.98M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.98M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.98M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.98M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.98M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.98M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.98M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.98M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.98M
    constexpr bool READ_32_BITS =
202
4.98M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.98M
    if (READ_32_BITS) {
205
4.98M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.98M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.98M
        return word & mask;
208
4.98M
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
4.98M
}
_ZN5doris11UnpackValueILi4ELi29ELb1EEEmPKh
Line
Count
Source
175
4.98M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.98M
    if (BIT_WIDTH == 0) return 0;
177
178
4.98M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.98M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.98M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.98M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.98M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.98M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.98M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.98M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.98M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.98M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.98M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.98M
    constexpr bool READ_32_BITS =
202
4.98M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.98M
    if (READ_32_BITS) {
205
4.98M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.98M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.98M
        return word & mask;
208
4.98M
    }
209
210
776
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
776
    word >>= FIRST_BIT_OFFSET;
212
213
776
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
776
    return word & mask;
220
4.98M
}
_ZN5doris11UnpackValueILi4ELi30ELb1EEEmPKh
Line
Count
Source
175
4.98M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.98M
    if (BIT_WIDTH == 0) return 0;
177
178
4.98M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.98M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.98M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.98M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.98M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.98M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.98M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.98M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.98M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.98M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.98M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.98M
    constexpr bool READ_32_BITS =
202
4.98M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.98M
    if (READ_32_BITS) {
205
4.98M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.98M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.98M
        return word & mask;
208
4.98M
    }
209
210
492
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
492
    word >>= FIRST_BIT_OFFSET;
212
213
492
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
492
    return word & mask;
220
4.98M
}
_ZN5doris11UnpackValueILi4ELi31ELb1EEEmPKh
Line
Count
Source
175
4.99M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
4.99M
    if (BIT_WIDTH == 0) return 0;
177
178
4.99M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
4.99M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
4.99M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
4.99M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
4.99M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
4.99M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
4.99M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
4.99M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
4.99M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
4.99M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
4.99M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
4.99M
    constexpr bool READ_32_BITS =
202
4.99M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
4.99M
    if (READ_32_BITS) {
205
4.99M
        uint32_t word = in[FIRST_WORD_IDX];
206
4.99M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
4.99M
        return word & mask;
208
4.99M
    }
209
210
74
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
74
    word >>= FIRST_BIT_OFFSET;
212
213
74
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
74
    return word & mask;
220
4.99M
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi4ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi4ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi4ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi4ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi4ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi4ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi4ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi4ELi23ELb0EEEmPKh
Line
Count
Source
175
370k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
370k
    if (BIT_WIDTH == 0) return 0;
177
178
370k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
370k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
370k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
370k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
370k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
370k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
370k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
370k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
370k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
370k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
370k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
370k
    constexpr bool READ_32_BITS =
202
370k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
370k
    if (READ_32_BITS) {
205
370k
        uint32_t word = in[FIRST_WORD_IDX];
206
370k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
370k
        return word & mask;
208
370k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
370k
}
_ZN5doris11UnpackValueILi4ELi22ELb0EEEmPKh
Line
Count
Source
175
370k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
370k
    if (BIT_WIDTH == 0) return 0;
177
178
370k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
370k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
370k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
370k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
370k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
370k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
370k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
370k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
370k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
370k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
370k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
370k
    constexpr bool READ_32_BITS =
202
370k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
370k
    if (READ_32_BITS) {
205
370k
        uint32_t word = in[FIRST_WORD_IDX];
206
370k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
370k
        return word & mask;
208
370k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
370k
}
_ZN5doris11UnpackValueILi4ELi21ELb0EEEmPKh
Line
Count
Source
175
370k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
370k
    if (BIT_WIDTH == 0) return 0;
177
178
370k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
370k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
370k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
370k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
370k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
370k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
370k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
370k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
370k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
370k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
370k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
370k
    constexpr bool READ_32_BITS =
202
370k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
370k
    if (READ_32_BITS) {
205
370k
        uint32_t word = in[FIRST_WORD_IDX];
206
370k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
370k
        return word & mask;
208
370k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
370k
}
_ZN5doris11UnpackValueILi4ELi20ELb0EEEmPKh
Line
Count
Source
175
370k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
370k
    if (BIT_WIDTH == 0) return 0;
177
178
370k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
370k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
370k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
370k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
370k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
370k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
370k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
370k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
370k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
370k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
370k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
370k
    constexpr bool READ_32_BITS =
202
370k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
370k
    if (READ_32_BITS) {
205
370k
        uint32_t word = in[FIRST_WORD_IDX];
206
370k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
370k
        return word & mask;
208
370k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
370k
}
_ZN5doris11UnpackValueILi4ELi19ELb0EEEmPKh
Line
Count
Source
175
370k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
370k
    if (BIT_WIDTH == 0) return 0;
177
178
370k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
370k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
370k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
370k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
370k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
370k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
370k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
370k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
370k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
370k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
370k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
370k
    constexpr bool READ_32_BITS =
202
370k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
370k
    if (READ_32_BITS) {
205
370k
        uint32_t word = in[FIRST_WORD_IDX];
206
370k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
370k
        return word & mask;
208
370k
    }
209
210
4
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
4
    word >>= FIRST_BIT_OFFSET;
212
213
4
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
4
    return word & mask;
220
370k
}
_ZN5doris11UnpackValueILi4ELi18ELb0EEEmPKh
Line
Count
Source
175
370k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
370k
    if (BIT_WIDTH == 0) return 0;
177
178
370k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
370k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
370k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
370k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
370k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
370k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
370k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
370k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
370k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
370k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
370k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
370k
    constexpr bool READ_32_BITS =
202
370k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
370k
    if (READ_32_BITS) {
205
370k
        uint32_t word = in[FIRST_WORD_IDX];
206
370k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
370k
        return word & mask;
208
370k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
370k
}
_ZN5doris11UnpackValueILi4ELi17ELb0EEEmPKh
Line
Count
Source
175
370k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
370k
    if (BIT_WIDTH == 0) return 0;
177
178
370k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
370k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
370k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
370k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
370k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
370k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
370k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
370k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
370k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
370k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
370k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
370k
    constexpr bool READ_32_BITS =
202
370k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
370k
    if (READ_32_BITS) {
205
370k
        uint32_t word = in[FIRST_WORD_IDX];
206
370k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
370k
        return word & mask;
208
370k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
370k
}
_ZN5doris11UnpackValueILi4ELi16ELb0EEEmPKh
Line
Count
Source
175
370k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
370k
    if (BIT_WIDTH == 0) return 0;
177
178
370k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
370k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
370k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
370k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
370k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
370k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
370k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
370k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
370k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
370k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
370k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
370k
    constexpr bool READ_32_BITS =
202
370k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
370k
    if (READ_32_BITS) {
205
370k
        uint32_t word = in[FIRST_WORD_IDX];
206
370k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
370k
        return word & mask;
208
370k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
370k
}
_ZN5doris11UnpackValueILi4ELi15ELb0EEEmPKh
Line
Count
Source
175
400k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
400k
    if (BIT_WIDTH == 0) return 0;
177
178
400k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
400k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
400k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
400k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
400k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
400k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
400k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
400k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
400k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
400k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
400k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
400k
    constexpr bool READ_32_BITS =
202
400k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
400k
    if (READ_32_BITS) {
205
400k
        uint32_t word = in[FIRST_WORD_IDX];
206
400k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
400k
        return word & mask;
208
400k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
400k
}
_ZN5doris11UnpackValueILi4ELi14ELb0EEEmPKh
Line
Count
Source
175
400k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
400k
    if (BIT_WIDTH == 0) return 0;
177
178
400k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
400k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
400k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
400k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
400k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
400k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
400k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
400k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
400k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
400k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
400k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
400k
    constexpr bool READ_32_BITS =
202
400k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
400k
    if (READ_32_BITS) {
205
400k
        uint32_t word = in[FIRST_WORD_IDX];
206
400k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
400k
        return word & mask;
208
400k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
400k
}
_ZN5doris11UnpackValueILi4ELi13ELb0EEEmPKh
Line
Count
Source
175
400k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
400k
    if (BIT_WIDTH == 0) return 0;
177
178
400k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
400k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
400k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
400k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
400k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
400k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
400k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
400k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
400k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
400k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
400k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
400k
    constexpr bool READ_32_BITS =
202
400k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
400k
    if (READ_32_BITS) {
205
400k
        uint32_t word = in[FIRST_WORD_IDX];
206
400k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
400k
        return word & mask;
208
400k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
400k
}
_ZN5doris11UnpackValueILi4ELi12ELb0EEEmPKh
Line
Count
Source
175
400k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
400k
    if (BIT_WIDTH == 0) return 0;
177
178
400k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
400k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
400k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
400k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
400k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
400k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
400k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
400k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
400k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
400k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
400k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
400k
    constexpr bool READ_32_BITS =
202
400k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
400k
    if (READ_32_BITS) {
205
400k
        uint32_t word = in[FIRST_WORD_IDX];
206
400k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
400k
        return word & mask;
208
400k
    }
209
210
4
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
4
    word >>= FIRST_BIT_OFFSET;
212
213
4
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
4
    return word & mask;
220
400k
}
_ZN5doris11UnpackValueILi4ELi11ELb0EEEmPKh
Line
Count
Source
175
400k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
400k
    if (BIT_WIDTH == 0) return 0;
177
178
400k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
400k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
400k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
400k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
400k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
400k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
400k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
400k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
400k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
400k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
400k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
400k
    constexpr bool READ_32_BITS =
202
400k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
400k
    if (READ_32_BITS) {
205
400k
        uint32_t word = in[FIRST_WORD_IDX];
206
400k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
400k
        return word & mask;
208
400k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
400k
}
_ZN5doris11UnpackValueILi4ELi10ELb0EEEmPKh
Line
Count
Source
175
400k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
400k
    if (BIT_WIDTH == 0) return 0;
177
178
400k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
400k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
400k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
400k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
400k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
400k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
400k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
400k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
400k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
400k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
400k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
400k
    constexpr bool READ_32_BITS =
202
400k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
400k
    if (READ_32_BITS) {
205
400k
        uint32_t word = in[FIRST_WORD_IDX];
206
400k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
400k
        return word & mask;
208
400k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
400k
}
_ZN5doris11UnpackValueILi4ELi9ELb0EEEmPKh
Line
Count
Source
175
400k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
400k
    if (BIT_WIDTH == 0) return 0;
177
178
400k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
400k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
400k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
400k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
400k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
400k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
400k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
400k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
400k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
400k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
400k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
400k
    constexpr bool READ_32_BITS =
202
400k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
400k
    if (READ_32_BITS) {
205
400k
        uint32_t word = in[FIRST_WORD_IDX];
206
400k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
400k
        return word & mask;
208
400k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
400k
}
_ZN5doris11UnpackValueILi4ELi8ELb0EEEmPKh
Line
Count
Source
175
400k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
400k
    if (BIT_WIDTH == 0) return 0;
177
178
400k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
400k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
400k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
400k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
400k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
400k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
400k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
400k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
400k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
400k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
400k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
400k
    constexpr bool READ_32_BITS =
202
400k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
400k
    if (READ_32_BITS) {
205
400k
        uint32_t word = in[FIRST_WORD_IDX];
206
400k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
400k
        return word & mask;
208
400k
    }
209
210
34
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
34
    word >>= FIRST_BIT_OFFSET;
212
213
34
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
34
    return word & mask;
220
400k
}
_ZN5doris11UnpackValueILi4ELi7ELb0EEEmPKh
Line
Count
Source
175
401k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
401k
    if (BIT_WIDTH == 0) return 0;
177
178
401k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
401k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
401k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
401k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
401k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
401k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
401k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
401k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
401k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
401k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
401k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
401k
    constexpr bool READ_32_BITS =
202
401k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
401k
    if (READ_32_BITS) {
205
401k
        uint32_t word = in[FIRST_WORD_IDX];
206
401k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
401k
        return word & mask;
208
401k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
401k
}
_ZN5doris11UnpackValueILi4ELi6ELb0EEEmPKh
Line
Count
Source
175
401k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
401k
    if (BIT_WIDTH == 0) return 0;
177
178
401k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
401k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
401k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
401k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
401k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
401k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
401k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
401k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
401k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
401k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
401k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
401k
    constexpr bool READ_32_BITS =
202
401k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
401k
    if (READ_32_BITS) {
205
401k
        uint32_t word = in[FIRST_WORD_IDX];
206
401k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
401k
        return word & mask;
208
401k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
401k
}
_ZN5doris11UnpackValueILi4ELi5ELb0EEEmPKh
Line
Count
Source
175
401k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
401k
    if (BIT_WIDTH == 0) return 0;
177
178
401k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
401k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
401k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
401k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
401k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
401k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
401k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
401k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
401k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
401k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
401k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
401k
    constexpr bool READ_32_BITS =
202
401k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
401k
    if (READ_32_BITS) {
205
401k
        uint32_t word = in[FIRST_WORD_IDX];
206
401k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
401k
        return word & mask;
208
401k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
401k
}
_ZN5doris11UnpackValueILi4ELi4ELb0EEEmPKh
Line
Count
Source
175
401k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
401k
    if (BIT_WIDTH == 0) return 0;
177
178
401k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
401k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
401k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
401k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
401k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
401k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
401k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
401k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
401k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
401k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
401k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
401k
    constexpr bool READ_32_BITS =
202
401k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
401k
    if (READ_32_BITS) {
205
401k
        uint32_t word = in[FIRST_WORD_IDX];
206
401k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
401k
        return word & mask;
208
401k
    }
209
210
8
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
8
    word >>= FIRST_BIT_OFFSET;
212
213
8
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
8
    return word & mask;
220
401k
}
_ZN5doris11UnpackValueILi4ELi3ELb0EEEmPKh
Line
Count
Source
175
401k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
401k
    if (BIT_WIDTH == 0) return 0;
177
178
401k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
401k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
401k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
401k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
401k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
401k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
401k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
401k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
401k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
401k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
401k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
401k
    constexpr bool READ_32_BITS =
202
401k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
401k
    if (READ_32_BITS) {
205
401k
        uint32_t word = in[FIRST_WORD_IDX];
206
401k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
401k
        return word & mask;
208
401k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
401k
}
_ZN5doris11UnpackValueILi4ELi2ELb0EEEmPKh
Line
Count
Source
175
401k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
401k
    if (BIT_WIDTH == 0) return 0;
177
178
401k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
401k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
401k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
401k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
401k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
401k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
401k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
401k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
401k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
401k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
401k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
401k
    constexpr bool READ_32_BITS =
202
401k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
401k
    if (READ_32_BITS) {
205
401k
        uint32_t word = in[FIRST_WORD_IDX];
206
401k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
401k
        return word & mask;
208
401k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
401k
}
_ZN5doris11UnpackValueILi4ELi1ELb0EEEmPKh
Line
Count
Source
175
401k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
401k
    if (BIT_WIDTH == 0) return 0;
177
178
401k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
401k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
401k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
401k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
401k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
401k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
401k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
401k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
401k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
401k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
401k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
401k
    constexpr bool READ_32_BITS =
202
401k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
401k
    if (READ_32_BITS) {
205
401k
        uint32_t word = in[FIRST_WORD_IDX];
206
401k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
401k
        return word & mask;
208
401k
    }
209
210
8
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
8
    word >>= FIRST_BIT_OFFSET;
212
213
8
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
8
    return word & mask;
220
401k
}
_ZN5doris11UnpackValueILi4ELi0ELb0EEEmPKh
Line
Count
Source
175
401k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
401k
    if (BIT_WIDTH == 0) return 0;
177
178
401k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
401k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
401k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
401k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
401k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
401k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
401k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
401k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
401k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
401k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
401k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
401k
    constexpr bool READ_32_BITS =
202
401k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
401k
    if (READ_32_BITS) {
205
401k
        uint32_t word = in[FIRST_WORD_IDX];
206
401k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
401k
        return word & mask;
208
401k
    }
209
210
12
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
12
    word >>= FIRST_BIT_OFFSET;
212
213
12
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
12
    return word & mask;
220
401k
}
_ZN5doris11UnpackValueILi5ELi0ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
11.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
11.1k
    word >>= FIRST_BIT_OFFSET;
212
213
11.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
11.1k
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi1ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
11.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
11.1k
    word >>= FIRST_BIT_OFFSET;
212
213
11.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
11.1k
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi2ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
11.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
11.1k
    word >>= FIRST_BIT_OFFSET;
212
213
11.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
11.1k
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi3ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
11.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
11.1k
    word >>= FIRST_BIT_OFFSET;
212
213
11.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
11.1k
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi4ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
11.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
11.1k
    word >>= FIRST_BIT_OFFSET;
212
213
11.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
11.1k
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi5ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
11.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
11.1k
    word >>= FIRST_BIT_OFFSET;
212
213
11.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
11.1k
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi6ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
11.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
11.1k
    word >>= FIRST_BIT_OFFSET;
212
213
11.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
11.1k
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi7ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
11.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
11.1k
    word >>= FIRST_BIT_OFFSET;
212
213
11.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
11.1k
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi8ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
11.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
11.1k
    word >>= FIRST_BIT_OFFSET;
212
213
11.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
11.1k
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi9ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
11.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
11.1k
    word >>= FIRST_BIT_OFFSET;
212
213
11.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
11.1k
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi10ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
11.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
11.1k
    word >>= FIRST_BIT_OFFSET;
212
213
11.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
11.1k
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi11ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
11.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
11.1k
    word >>= FIRST_BIT_OFFSET;
212
213
11.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
11.1k
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi12ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
11.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
11.1k
    word >>= FIRST_BIT_OFFSET;
212
213
11.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
11.1k
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi13ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
11.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
11.1k
    word >>= FIRST_BIT_OFFSET;
212
213
11.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
11.1k
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi14ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
11.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
11.1k
    word >>= FIRST_BIT_OFFSET;
212
213
11.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
11.1k
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi15ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
11.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
11.1k
    word >>= FIRST_BIT_OFFSET;
212
213
11.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
11.1k
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi16ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
11.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
11.1k
    word >>= FIRST_BIT_OFFSET;
212
213
11.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
11.1k
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi17ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
11.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
11.1k
    word >>= FIRST_BIT_OFFSET;
212
213
11.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
11.1k
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi18ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
11.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
11.1k
    word >>= FIRST_BIT_OFFSET;
212
213
11.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
11.1k
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi19ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
11.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
11.1k
    word >>= FIRST_BIT_OFFSET;
212
213
11.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
11.1k
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi20ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
11.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
11.1k
    word >>= FIRST_BIT_OFFSET;
212
213
11.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
11.1k
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi21ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
11.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
11.1k
    word >>= FIRST_BIT_OFFSET;
212
213
11.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
11.1k
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi22ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
11.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
11.1k
    word >>= FIRST_BIT_OFFSET;
212
213
11.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
11.1k
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi23ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
11.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
11.1k
    word >>= FIRST_BIT_OFFSET;
212
213
11.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
11.1k
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi24ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
11.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
11.1k
    word >>= FIRST_BIT_OFFSET;
212
213
11.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
11.1k
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi25ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
11.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
11.1k
    word >>= FIRST_BIT_OFFSET;
212
213
11.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
11.1k
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi26ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
11.1k
        uint32_t word = in[FIRST_WORD_IDX];
206
11.1k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
11.1k
        return word & mask;
208
11.1k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi27ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
11.1k
        uint32_t word = in[FIRST_WORD_IDX];
206
11.1k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
11.1k
        return word & mask;
208
11.1k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi28ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
11.1k
        uint32_t word = in[FIRST_WORD_IDX];
206
11.1k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
11.1k
        return word & mask;
208
11.1k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi29ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
11.1k
        uint32_t word = in[FIRST_WORD_IDX];
206
11.1k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
11.1k
        return word & mask;
208
11.1k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi30ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
11.1k
        uint32_t word = in[FIRST_WORD_IDX];
206
11.1k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
11.1k
        return word & mask;
208
11.1k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
11.1k
}
_ZN5doris11UnpackValueILi5ELi31ELb1EEEmPKh
Line
Count
Source
175
11.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.1k
    if (BIT_WIDTH == 0) return 0;
177
178
11.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.1k
    constexpr bool READ_32_BITS =
202
11.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.1k
    if (READ_32_BITS) {
205
11.1k
        uint32_t word = in[FIRST_WORD_IDX];
206
11.1k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
11.1k
        return word & mask;
208
11.1k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
11.1k
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi5ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi5ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi5ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi5ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi5ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi5ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi5ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi5ELi23ELb0EEEmPKh
Line
Count
Source
175
716
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
716
    if (BIT_WIDTH == 0) return 0;
177
178
716
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
716
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
716
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
716
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
716
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
716
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
716
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
716
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
716
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
716
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
716
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
716
    constexpr bool READ_32_BITS =
202
716
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
716
    if (READ_32_BITS) {
205
716
        uint32_t word = in[FIRST_WORD_IDX];
206
716
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
716
        return word & mask;
208
716
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
716
}
_ZN5doris11UnpackValueILi5ELi22ELb0EEEmPKh
Line
Count
Source
175
716
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
716
    if (BIT_WIDTH == 0) return 0;
177
178
716
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
716
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
716
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
716
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
716
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
716
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
716
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
716
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
716
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
716
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
716
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
716
    constexpr bool READ_32_BITS =
202
716
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
716
    if (READ_32_BITS) {
205
716
        uint32_t word = in[FIRST_WORD_IDX];
206
716
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
716
        return word & mask;
208
716
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
716
}
_ZN5doris11UnpackValueILi5ELi21ELb0EEEmPKh
Line
Count
Source
175
716
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
716
    if (BIT_WIDTH == 0) return 0;
177
178
716
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
716
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
716
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
716
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
716
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
716
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
716
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
716
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
716
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
716
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
716
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
716
    constexpr bool READ_32_BITS =
202
716
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
716
    if (READ_32_BITS) {
205
716
        uint32_t word = in[FIRST_WORD_IDX];
206
716
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
716
        return word & mask;
208
716
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
716
}
_ZN5doris11UnpackValueILi5ELi20ELb0EEEmPKh
Line
Count
Source
175
716
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
716
    if (BIT_WIDTH == 0) return 0;
177
178
716
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
716
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
716
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
716
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
716
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
716
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
716
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
716
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
716
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
716
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
716
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
716
    constexpr bool READ_32_BITS =
202
716
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
716
    if (READ_32_BITS) {
205
716
        uint32_t word = in[FIRST_WORD_IDX];
206
716
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
716
        return word & mask;
208
716
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
716
}
_ZN5doris11UnpackValueILi5ELi19ELb0EEEmPKh
Line
Count
Source
175
716
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
716
    if (BIT_WIDTH == 0) return 0;
177
178
716
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
716
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
716
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
716
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
716
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
716
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
716
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
716
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
716
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
716
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
716
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
716
    constexpr bool READ_32_BITS =
202
716
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
716
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
716
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
716
    word >>= FIRST_BIT_OFFSET;
212
213
716
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
716
    return word & mask;
220
716
}
_ZN5doris11UnpackValueILi5ELi18ELb0EEEmPKh
Line
Count
Source
175
716
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
716
    if (BIT_WIDTH == 0) return 0;
177
178
716
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
716
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
716
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
716
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
716
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
716
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
716
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
716
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
716
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
716
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
716
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
716
    constexpr bool READ_32_BITS =
202
716
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
716
    if (READ_32_BITS) {
205
716
        uint32_t word = in[FIRST_WORD_IDX];
206
716
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
716
        return word & mask;
208
716
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
716
}
_ZN5doris11UnpackValueILi5ELi17ELb0EEEmPKh
Line
Count
Source
175
716
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
716
    if (BIT_WIDTH == 0) return 0;
177
178
716
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
716
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
716
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
716
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
716
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
716
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
716
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
716
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
716
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
716
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
716
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
716
    constexpr bool READ_32_BITS =
202
716
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
716
    if (READ_32_BITS) {
205
716
        uint32_t word = in[FIRST_WORD_IDX];
206
716
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
716
        return word & mask;
208
716
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
716
}
_ZN5doris11UnpackValueILi5ELi16ELb0EEEmPKh
Line
Count
Source
175
716
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
716
    if (BIT_WIDTH == 0) return 0;
177
178
716
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
716
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
716
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
716
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
716
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
716
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
716
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
716
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
716
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
716
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
716
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
716
    constexpr bool READ_32_BITS =
202
716
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
716
    if (READ_32_BITS) {
205
716
        uint32_t word = in[FIRST_WORD_IDX];
206
716
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
716
        return word & mask;
208
716
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
716
}
_ZN5doris11UnpackValueILi5ELi15ELb0EEEmPKh
Line
Count
Source
175
848
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
848
    if (BIT_WIDTH == 0) return 0;
177
178
848
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
848
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
848
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
848
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
848
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
848
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
848
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
848
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
848
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
848
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
848
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
848
    constexpr bool READ_32_BITS =
202
848
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
848
    if (READ_32_BITS) {
205
848
        uint32_t word = in[FIRST_WORD_IDX];
206
848
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
848
        return word & mask;
208
848
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
848
}
_ZN5doris11UnpackValueILi5ELi14ELb0EEEmPKh
Line
Count
Source
175
848
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
848
    if (BIT_WIDTH == 0) return 0;
177
178
848
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
848
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
848
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
848
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
848
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
848
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
848
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
848
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
848
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
848
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
848
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
848
    constexpr bool READ_32_BITS =
202
848
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
848
    if (READ_32_BITS) {
205
848
        uint32_t word = in[FIRST_WORD_IDX];
206
848
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
848
        return word & mask;
208
848
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
848
}
_ZN5doris11UnpackValueILi5ELi13ELb0EEEmPKh
Line
Count
Source
175
848
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
848
    if (BIT_WIDTH == 0) return 0;
177
178
848
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
848
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
848
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
848
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
848
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
848
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
848
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
848
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
848
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
848
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
848
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
848
    constexpr bool READ_32_BITS =
202
848
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
848
    if (READ_32_BITS) {
205
848
        uint32_t word = in[FIRST_WORD_IDX];
206
848
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
848
        return word & mask;
208
848
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
848
}
_ZN5doris11UnpackValueILi5ELi12ELb0EEEmPKh
Line
Count
Source
175
848
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
848
    if (BIT_WIDTH == 0) return 0;
177
178
848
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
848
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
848
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
848
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
848
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
848
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
848
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
848
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
848
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
848
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
848
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
848
    constexpr bool READ_32_BITS =
202
848
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
848
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
848
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
848
    word >>= FIRST_BIT_OFFSET;
212
213
848
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
848
    return word & mask;
220
848
}
_ZN5doris11UnpackValueILi5ELi11ELb0EEEmPKh
Line
Count
Source
175
848
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
848
    if (BIT_WIDTH == 0) return 0;
177
178
848
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
848
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
848
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
848
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
848
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
848
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
848
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
848
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
848
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
848
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
848
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
848
    constexpr bool READ_32_BITS =
202
848
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
848
    if (READ_32_BITS) {
205
848
        uint32_t word = in[FIRST_WORD_IDX];
206
848
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
848
        return word & mask;
208
848
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
848
}
_ZN5doris11UnpackValueILi5ELi10ELb0EEEmPKh
Line
Count
Source
175
848
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
848
    if (BIT_WIDTH == 0) return 0;
177
178
848
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
848
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
848
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
848
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
848
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
848
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
848
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
848
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
848
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
848
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
848
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
848
    constexpr bool READ_32_BITS =
202
848
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
848
    if (READ_32_BITS) {
205
848
        uint32_t word = in[FIRST_WORD_IDX];
206
848
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
848
        return word & mask;
208
848
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
848
}
_ZN5doris11UnpackValueILi5ELi9ELb0EEEmPKh
Line
Count
Source
175
848
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
848
    if (BIT_WIDTH == 0) return 0;
177
178
848
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
848
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
848
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
848
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
848
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
848
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
848
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
848
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
848
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
848
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
848
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
848
    constexpr bool READ_32_BITS =
202
848
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
848
    if (READ_32_BITS) {
205
848
        uint32_t word = in[FIRST_WORD_IDX];
206
848
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
848
        return word & mask;
208
848
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
848
}
_ZN5doris11UnpackValueILi5ELi8ELb0EEEmPKh
Line
Count
Source
175
848
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
848
    if (BIT_WIDTH == 0) return 0;
177
178
848
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
848
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
848
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
848
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
848
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
848
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
848
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
848
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
848
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
848
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
848
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
848
    constexpr bool READ_32_BITS =
202
848
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
848
    if (READ_32_BITS) {
205
848
        uint32_t word = in[FIRST_WORD_IDX];
206
848
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
848
        return word & mask;
208
848
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
848
}
_ZN5doris11UnpackValueILi5ELi7ELb0EEEmPKh
Line
Count
Source
175
1.76k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.76k
    if (BIT_WIDTH == 0) return 0;
177
178
1.76k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.76k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.76k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.76k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.76k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.76k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.76k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.76k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.76k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.76k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.76k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.76k
    constexpr bool READ_32_BITS =
202
1.76k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.76k
    if (READ_32_BITS) {
205
1.76k
        uint32_t word = in[FIRST_WORD_IDX];
206
1.76k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.76k
        return word & mask;
208
1.76k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
1.76k
}
_ZN5doris11UnpackValueILi5ELi6ELb0EEEmPKh
Line
Count
Source
175
1.76k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.76k
    if (BIT_WIDTH == 0) return 0;
177
178
1.76k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.76k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.76k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.76k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.76k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.76k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.76k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.76k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.76k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.76k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.76k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.76k
    constexpr bool READ_32_BITS =
202
1.76k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.76k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.76k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.76k
    word >>= FIRST_BIT_OFFSET;
212
213
1.76k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.76k
    return word & mask;
220
1.76k
}
_ZN5doris11UnpackValueILi5ELi5ELb0EEEmPKh
Line
Count
Source
175
1.76k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.76k
    if (BIT_WIDTH == 0) return 0;
177
178
1.76k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.76k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.76k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.76k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.76k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.76k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.76k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.76k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.76k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.76k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.76k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.76k
    constexpr bool READ_32_BITS =
202
1.76k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.76k
    if (READ_32_BITS) {
205
1.76k
        uint32_t word = in[FIRST_WORD_IDX];
206
1.76k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.76k
        return word & mask;
208
1.76k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
1.76k
}
_ZN5doris11UnpackValueILi5ELi4ELb0EEEmPKh
Line
Count
Source
175
1.76k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.76k
    if (BIT_WIDTH == 0) return 0;
177
178
1.76k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.76k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.76k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.76k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.76k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.76k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.76k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.76k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.76k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.76k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.76k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.76k
    constexpr bool READ_32_BITS =
202
1.76k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.76k
    if (READ_32_BITS) {
205
1.76k
        uint32_t word = in[FIRST_WORD_IDX];
206
1.76k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.76k
        return word & mask;
208
1.76k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
1.76k
}
_ZN5doris11UnpackValueILi5ELi3ELb0EEEmPKh
Line
Count
Source
175
1.76k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.76k
    if (BIT_WIDTH == 0) return 0;
177
178
1.76k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.76k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.76k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.76k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.76k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.76k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.76k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.76k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.76k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.76k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.76k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.76k
    constexpr bool READ_32_BITS =
202
1.76k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.76k
    if (READ_32_BITS) {
205
1.76k
        uint32_t word = in[FIRST_WORD_IDX];
206
1.76k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.76k
        return word & mask;
208
1.76k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
1.76k
}
_ZN5doris11UnpackValueILi5ELi2ELb0EEEmPKh
Line
Count
Source
175
1.76k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.76k
    if (BIT_WIDTH == 0) return 0;
177
178
1.76k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.76k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.76k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.76k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.76k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.76k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.76k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.76k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.76k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.76k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.76k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.76k
    constexpr bool READ_32_BITS =
202
1.76k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.76k
    if (READ_32_BITS) {
205
1.76k
        uint32_t word = in[FIRST_WORD_IDX];
206
1.76k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.76k
        return word & mask;
208
1.76k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
1.76k
}
_ZN5doris11UnpackValueILi5ELi1ELb0EEEmPKh
Line
Count
Source
175
1.76k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.76k
    if (BIT_WIDTH == 0) return 0;
177
178
1.76k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.76k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.76k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.76k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.76k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.76k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.76k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.76k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.76k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.76k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.76k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.76k
    constexpr bool READ_32_BITS =
202
1.76k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.76k
    if (READ_32_BITS) {
205
1.76k
        uint32_t word = in[FIRST_WORD_IDX];
206
1.76k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.76k
        return word & mask;
208
1.76k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
1.76k
}
_ZN5doris11UnpackValueILi5ELi0ELb0EEEmPKh
Line
Count
Source
175
1.76k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.76k
    if (BIT_WIDTH == 0) return 0;
177
178
1.76k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.76k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.76k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.76k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.76k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.76k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.76k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.76k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.76k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.76k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.76k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.76k
    constexpr bool READ_32_BITS =
202
1.76k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.76k
    if (READ_32_BITS) {
205
1.76k
        uint32_t word = in[FIRST_WORD_IDX];
206
1.76k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.76k
        return word & mask;
208
1.76k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
1.76k
}
_ZN5doris11UnpackValueILi6ELi0ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.26M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.26M
    word >>= FIRST_BIT_OFFSET;
212
213
3.26M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.26M
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi1ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.26M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.26M
    word >>= FIRST_BIT_OFFSET;
212
213
3.26M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.26M
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi2ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.26M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.26M
    word >>= FIRST_BIT_OFFSET;
212
213
3.26M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.26M
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi3ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.26M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.26M
    word >>= FIRST_BIT_OFFSET;
212
213
3.26M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.26M
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi4ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.26M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.26M
    word >>= FIRST_BIT_OFFSET;
212
213
3.26M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.26M
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi5ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.26M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.26M
    word >>= FIRST_BIT_OFFSET;
212
213
3.26M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.26M
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi6ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.26M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.26M
    word >>= FIRST_BIT_OFFSET;
212
213
3.26M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.26M
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi7ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.26M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.26M
    word >>= FIRST_BIT_OFFSET;
212
213
3.26M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.26M
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi8ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.26M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.26M
    word >>= FIRST_BIT_OFFSET;
212
213
3.26M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.26M
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi9ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.26M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.26M
    word >>= FIRST_BIT_OFFSET;
212
213
3.26M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.26M
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi10ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.26M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.26M
    word >>= FIRST_BIT_OFFSET;
212
213
3.26M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.26M
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi11ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.26M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.26M
    word >>= FIRST_BIT_OFFSET;
212
213
3.26M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.26M
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi12ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.26M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.26M
    word >>= FIRST_BIT_OFFSET;
212
213
3.26M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.26M
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi13ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.26M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.26M
    word >>= FIRST_BIT_OFFSET;
212
213
3.26M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.26M
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi14ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.26M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.26M
    word >>= FIRST_BIT_OFFSET;
212
213
3.26M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.26M
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi15ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.26M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.26M
    word >>= FIRST_BIT_OFFSET;
212
213
3.26M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.26M
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi16ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.26M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.26M
    word >>= FIRST_BIT_OFFSET;
212
213
3.26M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.26M
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi17ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.26M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.26M
    word >>= FIRST_BIT_OFFSET;
212
213
3.26M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.26M
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi18ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.26M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.26M
    word >>= FIRST_BIT_OFFSET;
212
213
3.26M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.26M
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi19ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.26M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.26M
    word >>= FIRST_BIT_OFFSET;
212
213
3.26M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.26M
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi20ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.26M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.26M
    word >>= FIRST_BIT_OFFSET;
212
213
3.26M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.26M
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi21ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.26M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.26M
    word >>= FIRST_BIT_OFFSET;
212
213
3.26M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.26M
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi22ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.26M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.26M
    word >>= FIRST_BIT_OFFSET;
212
213
3.26M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.26M
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi23ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.26M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.26M
    word >>= FIRST_BIT_OFFSET;
212
213
3.26M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.26M
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi24ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.26M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.26M
    word >>= FIRST_BIT_OFFSET;
212
213
3.26M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.26M
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi25ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.26M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.26M
    word >>= FIRST_BIT_OFFSET;
212
213
3.26M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.26M
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi26ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.26M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.26M
    word >>= FIRST_BIT_OFFSET;
212
213
3.26M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.26M
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi27ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
3.26M
        uint32_t word = in[FIRST_WORD_IDX];
206
3.26M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
3.26M
        return word & mask;
208
3.26M
    }
209
210
448
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
448
    word >>= FIRST_BIT_OFFSET;
212
213
448
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
448
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi28ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
3.26M
        uint32_t word = in[FIRST_WORD_IDX];
206
3.26M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
3.26M
        return word & mask;
208
3.26M
    }
209
210
484
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
484
    word >>= FIRST_BIT_OFFSET;
212
213
484
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
484
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi29ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
3.26M
        uint32_t word = in[FIRST_WORD_IDX];
206
3.26M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
3.26M
        return word & mask;
208
3.26M
    }
209
210
782
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
782
    word >>= FIRST_BIT_OFFSET;
212
213
782
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
782
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi30ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
3.26M
        uint32_t word = in[FIRST_WORD_IDX];
206
3.26M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
3.26M
        return word & mask;
208
3.26M
    }
209
210
2.51k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.51k
    word >>= FIRST_BIT_OFFSET;
212
213
2.51k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.51k
    return word & mask;
220
3.26M
}
_ZN5doris11UnpackValueILi6ELi31ELb1EEEmPKh
Line
Count
Source
175
3.26M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.26M
    if (BIT_WIDTH == 0) return 0;
177
178
3.26M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.26M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.26M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.26M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.26M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.26M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.26M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.26M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.26M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.26M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.26M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.26M
    constexpr bool READ_32_BITS =
202
3.26M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.26M
    if (READ_32_BITS) {
205
3.26M
        uint32_t word = in[FIRST_WORD_IDX];
206
18.4E
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
3.26M
        return word & mask;
208
3.26M
    }
209
210
484
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
484
    word >>= FIRST_BIT_OFFSET;
212
213
484
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
484
    return word & mask;
220
3.26M
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi6ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi6ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi6ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi6ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi6ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi6ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi6ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi6ELi23ELb0EEEmPKh
Line
Count
Source
175
220k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
220k
    if (BIT_WIDTH == 0) return 0;
177
178
220k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
220k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
220k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
220k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
220k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
220k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
220k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
220k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
220k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
220k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
220k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
220k
    constexpr bool READ_32_BITS =
202
220k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
220k
    if (READ_32_BITS) {
205
220k
        uint32_t word = in[FIRST_WORD_IDX];
206
18.4E
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
220k
        return word & mask;
208
220k
    }
209
210
4
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
4
    word >>= FIRST_BIT_OFFSET;
212
213
4
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
4
    return word & mask;
220
220k
}
_ZN5doris11UnpackValueILi6ELi22ELb0EEEmPKh
Line
Count
Source
175
220k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
220k
    if (BIT_WIDTH == 0) return 0;
177
178
220k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
220k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
220k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
220k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
220k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
220k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
220k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
220k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
220k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
220k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
220k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
220k
    constexpr bool READ_32_BITS =
202
220k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
220k
    if (READ_32_BITS) {
205
220k
        uint32_t word = in[FIRST_WORD_IDX];
206
220k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
220k
        return word & mask;
208
220k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
220k
}
_ZN5doris11UnpackValueILi6ELi21ELb0EEEmPKh
Line
Count
Source
175
220k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
220k
    if (BIT_WIDTH == 0) return 0;
177
178
220k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
220k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
220k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
220k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
220k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
220k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
220k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
220k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
220k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
220k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
220k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
220k
    constexpr bool READ_32_BITS =
202
220k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
220k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
220k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
220k
    word >>= FIRST_BIT_OFFSET;
212
213
220k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
220k
    return word & mask;
220
220k
}
_ZN5doris11UnpackValueILi6ELi20ELb0EEEmPKh
Line
Count
Source
175
220k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
220k
    if (BIT_WIDTH == 0) return 0;
177
178
220k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
220k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
220k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
220k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
220k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
220k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
220k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
220k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
220k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
220k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
220k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
220k
    constexpr bool READ_32_BITS =
202
220k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
220k
    if (READ_32_BITS) {
205
220k
        uint32_t word = in[FIRST_WORD_IDX];
206
220k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
220k
        return word & mask;
208
220k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
220k
}
_ZN5doris11UnpackValueILi6ELi19ELb0EEEmPKh
Line
Count
Source
175
220k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
220k
    if (BIT_WIDTH == 0) return 0;
177
178
220k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
220k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
220k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
220k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
220k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
220k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
220k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
220k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
220k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
220k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
220k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
220k
    constexpr bool READ_32_BITS =
202
220k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
220k
    if (READ_32_BITS) {
205
220k
        uint32_t word = in[FIRST_WORD_IDX];
206
220k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
220k
        return word & mask;
208
220k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
220k
}
_ZN5doris11UnpackValueILi6ELi18ELb0EEEmPKh
Line
Count
Source
175
220k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
220k
    if (BIT_WIDTH == 0) return 0;
177
178
220k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
220k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
220k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
220k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
220k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
220k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
220k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
220k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
220k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
220k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
220k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
220k
    constexpr bool READ_32_BITS =
202
220k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
220k
    if (READ_32_BITS) {
205
220k
        uint32_t word = in[FIRST_WORD_IDX];
206
220k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
220k
        return word & mask;
208
220k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
220k
}
_ZN5doris11UnpackValueILi6ELi17ELb0EEEmPKh
Line
Count
Source
175
220k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
220k
    if (BIT_WIDTH == 0) return 0;
177
178
220k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
220k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
220k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
220k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
220k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
220k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
220k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
220k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
220k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
220k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
220k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
220k
    constexpr bool READ_32_BITS =
202
220k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
220k
    if (READ_32_BITS) {
205
220k
        uint32_t word = in[FIRST_WORD_IDX];
206
220k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
220k
        return word & mask;
208
220k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
220k
}
_ZN5doris11UnpackValueILi6ELi16ELb0EEEmPKh
Line
Count
Source
175
220k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
220k
    if (BIT_WIDTH == 0) return 0;
177
178
220k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
220k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
220k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
220k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
220k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
220k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
220k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
220k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
220k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
220k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
220k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
220k
    constexpr bool READ_32_BITS =
202
220k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
220k
    if (READ_32_BITS) {
205
220k
        uint32_t word = in[FIRST_WORD_IDX];
206
220k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
220k
        return word & mask;
208
220k
    }
209
210
6
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
6
    word >>= FIRST_BIT_OFFSET;
212
213
6
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
6
    return word & mask;
220
220k
}
_ZN5doris11UnpackValueILi6ELi15ELb0EEEmPKh
Line
Count
Source
175
220k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
220k
    if (BIT_WIDTH == 0) return 0;
177
178
220k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
220k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
220k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
220k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
220k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
220k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
220k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
220k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
220k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
220k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
220k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
220k
    constexpr bool READ_32_BITS =
202
220k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
220k
    if (READ_32_BITS) {
205
220k
        uint32_t word = in[FIRST_WORD_IDX];
206
220k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
220k
        return word & mask;
208
220k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
220k
}
_ZN5doris11UnpackValueILi6ELi14ELb0EEEmPKh
Line
Count
Source
175
220k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
220k
    if (BIT_WIDTH == 0) return 0;
177
178
220k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
220k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
220k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
220k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
220k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
220k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
220k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
220k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
220k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
220k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
220k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
220k
    constexpr bool READ_32_BITS =
202
220k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
220k
    if (READ_32_BITS) {
205
220k
        uint32_t word = in[FIRST_WORD_IDX];
206
220k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
220k
        return word & mask;
208
220k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
220k
}
_ZN5doris11UnpackValueILi6ELi13ELb0EEEmPKh
Line
Count
Source
175
220k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
220k
    if (BIT_WIDTH == 0) return 0;
177
178
220k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
220k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
220k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
220k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
220k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
220k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
220k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
220k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
220k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
220k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
220k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
220k
    constexpr bool READ_32_BITS =
202
220k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
220k
    if (READ_32_BITS) {
205
220k
        uint32_t word = in[FIRST_WORD_IDX];
206
220k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
220k
        return word & mask;
208
220k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
220k
}
_ZN5doris11UnpackValueILi6ELi12ELb0EEEmPKh
Line
Count
Source
175
220k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
220k
    if (BIT_WIDTH == 0) return 0;
177
178
220k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
220k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
220k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
220k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
220k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
220k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
220k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
220k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
220k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
220k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
220k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
220k
    constexpr bool READ_32_BITS =
202
220k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
220k
    if (READ_32_BITS) {
205
220k
        uint32_t word = in[FIRST_WORD_IDX];
206
220k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
220k
        return word & mask;
208
220k
    }
209
210
18
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18
    word >>= FIRST_BIT_OFFSET;
212
213
18
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18
    return word & mask;
220
220k
}
_ZN5doris11UnpackValueILi6ELi11ELb0EEEmPKh
Line
Count
Source
175
220k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
220k
    if (BIT_WIDTH == 0) return 0;
177
178
220k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
220k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
220k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
220k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
220k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
220k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
220k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
220k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
220k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
220k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
220k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
220k
    constexpr bool READ_32_BITS =
202
220k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
220k
    if (READ_32_BITS) {
205
220k
        uint32_t word = in[FIRST_WORD_IDX];
206
220k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
220k
        return word & mask;
208
220k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
220k
}
_ZN5doris11UnpackValueILi6ELi10ELb0EEEmPKh
Line
Count
Source
175
220k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
220k
    if (BIT_WIDTH == 0) return 0;
177
178
220k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
220k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
220k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
220k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
220k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
220k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
220k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
220k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
220k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
220k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
220k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
220k
    constexpr bool READ_32_BITS =
202
220k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
220k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
220k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
220k
    word >>= FIRST_BIT_OFFSET;
212
213
220k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
220k
    return word & mask;
220
220k
}
_ZN5doris11UnpackValueILi6ELi9ELb0EEEmPKh
Line
Count
Source
175
220k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
220k
    if (BIT_WIDTH == 0) return 0;
177
178
220k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
220k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
220k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
220k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
220k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
220k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
220k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
220k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
220k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
220k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
220k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
220k
    constexpr bool READ_32_BITS =
202
220k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
220k
    if (READ_32_BITS) {
205
220k
        uint32_t word = in[FIRST_WORD_IDX];
206
220k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
220k
        return word & mask;
208
220k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
220k
}
_ZN5doris11UnpackValueILi6ELi8ELb0EEEmPKh
Line
Count
Source
175
220k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
220k
    if (BIT_WIDTH == 0) return 0;
177
178
220k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
220k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
220k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
220k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
220k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
220k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
220k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
220k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
220k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
220k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
220k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
220k
    constexpr bool READ_32_BITS =
202
220k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
220k
    if (READ_32_BITS) {
205
220k
        uint32_t word = in[FIRST_WORD_IDX];
206
220k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
220k
        return word & mask;
208
220k
    }
209
210
6
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
6
    word >>= FIRST_BIT_OFFSET;
212
213
6
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
6
    return word & mask;
220
220k
}
_ZN5doris11UnpackValueILi6ELi7ELb0EEEmPKh
Line
Count
Source
175
222k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
222k
    if (BIT_WIDTH == 0) return 0;
177
178
222k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
222k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
222k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
222k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
222k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
222k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
222k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
222k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
222k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
222k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
222k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
222k
    constexpr bool READ_32_BITS =
202
222k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
222k
    if (READ_32_BITS) {
205
222k
        uint32_t word = in[FIRST_WORD_IDX];
206
18.4E
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
222k
        return word & mask;
208
222k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
222k
}
_ZN5doris11UnpackValueILi6ELi6ELb0EEEmPKh
Line
Count
Source
175
222k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
222k
    if (BIT_WIDTH == 0) return 0;
177
178
222k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
222k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
222k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
222k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
222k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
222k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
222k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
222k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
222k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
222k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
222k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
222k
    constexpr bool READ_32_BITS =
202
222k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
222k
    if (READ_32_BITS) {
205
222k
        uint32_t word = in[FIRST_WORD_IDX];
206
222k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
222k
        return word & mask;
208
222k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
222k
}
_ZN5doris11UnpackValueILi6ELi5ELb0EEEmPKh
Line
Count
Source
175
222k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
222k
    if (BIT_WIDTH == 0) return 0;
177
178
222k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
222k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
222k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
222k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
222k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
222k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
222k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
222k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
222k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
222k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
222k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
222k
    constexpr bool READ_32_BITS =
202
222k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
222k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
222k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
222k
    word >>= FIRST_BIT_OFFSET;
212
213
222k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
222k
    return word & mask;
220
222k
}
_ZN5doris11UnpackValueILi6ELi4ELb0EEEmPKh
Line
Count
Source
175
222k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
222k
    if (BIT_WIDTH == 0) return 0;
177
178
222k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
222k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
222k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
222k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
222k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
222k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
222k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
222k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
222k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
222k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
222k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
222k
    constexpr bool READ_32_BITS =
202
222k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
222k
    if (READ_32_BITS) {
205
222k
        uint32_t word = in[FIRST_WORD_IDX];
206
222k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
222k
        return word & mask;
208
222k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
222k
}
_ZN5doris11UnpackValueILi6ELi3ELb0EEEmPKh
Line
Count
Source
175
222k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
222k
    if (BIT_WIDTH == 0) return 0;
177
178
222k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
222k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
222k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
222k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
222k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
222k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
222k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
222k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
222k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
222k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
222k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
222k
    constexpr bool READ_32_BITS =
202
222k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
222k
    if (READ_32_BITS) {
205
222k
        uint32_t word = in[FIRST_WORD_IDX];
206
222k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
222k
        return word & mask;
208
222k
    }
209
210
4
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
4
    word >>= FIRST_BIT_OFFSET;
212
213
4
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
4
    return word & mask;
220
222k
}
_ZN5doris11UnpackValueILi6ELi2ELb0EEEmPKh
Line
Count
Source
175
222k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
222k
    if (BIT_WIDTH == 0) return 0;
177
178
222k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
222k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
222k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
222k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
222k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
222k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
222k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
222k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
222k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
222k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
222k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
222k
    constexpr bool READ_32_BITS =
202
222k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
222k
    if (READ_32_BITS) {
205
222k
        uint32_t word = in[FIRST_WORD_IDX];
206
222k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
222k
        return word & mask;
208
222k
    }
209
210
6
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
6
    word >>= FIRST_BIT_OFFSET;
212
213
6
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
6
    return word & mask;
220
222k
}
_ZN5doris11UnpackValueILi6ELi1ELb0EEEmPKh
Line
Count
Source
175
222k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
222k
    if (BIT_WIDTH == 0) return 0;
177
178
222k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
222k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
222k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
222k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
222k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
222k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
222k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
222k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
222k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
222k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
222k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
222k
    constexpr bool READ_32_BITS =
202
222k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
222k
    if (READ_32_BITS) {
205
222k
        uint32_t word = in[FIRST_WORD_IDX];
206
222k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
222k
        return word & mask;
208
222k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
222k
}
_ZN5doris11UnpackValueILi6ELi0ELb0EEEmPKh
Line
Count
Source
175
222k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
222k
    if (BIT_WIDTH == 0) return 0;
177
178
222k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
222k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
222k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
222k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
222k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
222k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
222k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
222k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
222k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
222k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
222k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
222k
    constexpr bool READ_32_BITS =
202
222k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
222k
    if (READ_32_BITS) {
205
222k
        uint32_t word = in[FIRST_WORD_IDX];
206
222k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
222k
        return word & mask;
208
222k
    }
209
210
8
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
8
    word >>= FIRST_BIT_OFFSET;
212
213
8
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
8
    return word & mask;
220
222k
}
_ZN5doris11UnpackValueILi7ELi0ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.3k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.3k
    word >>= FIRST_BIT_OFFSET;
212
213
37.3k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.3k
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi1ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.3k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.3k
    word >>= FIRST_BIT_OFFSET;
212
213
37.3k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.3k
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi2ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.3k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.3k
    word >>= FIRST_BIT_OFFSET;
212
213
37.3k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.3k
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi3ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.3k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.3k
    word >>= FIRST_BIT_OFFSET;
212
213
37.3k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.3k
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi4ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.3k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.3k
    word >>= FIRST_BIT_OFFSET;
212
213
37.3k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.3k
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi5ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.3k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.3k
    word >>= FIRST_BIT_OFFSET;
212
213
37.3k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.3k
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi6ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.3k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.3k
    word >>= FIRST_BIT_OFFSET;
212
213
37.3k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.3k
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi7ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.3k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.3k
    word >>= FIRST_BIT_OFFSET;
212
213
37.3k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.3k
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi8ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.3k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.3k
    word >>= FIRST_BIT_OFFSET;
212
213
37.3k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.3k
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi9ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.3k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.3k
    word >>= FIRST_BIT_OFFSET;
212
213
37.3k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.3k
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi10ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.3k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.3k
    word >>= FIRST_BIT_OFFSET;
212
213
37.3k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.3k
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi11ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.3k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.3k
    word >>= FIRST_BIT_OFFSET;
212
213
37.3k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.3k
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi12ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.3k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.3k
    word >>= FIRST_BIT_OFFSET;
212
213
37.3k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.3k
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi13ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.3k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.3k
    word >>= FIRST_BIT_OFFSET;
212
213
37.3k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.3k
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi14ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.3k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.3k
    word >>= FIRST_BIT_OFFSET;
212
213
37.3k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.3k
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi15ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.3k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.3k
    word >>= FIRST_BIT_OFFSET;
212
213
37.3k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.3k
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi16ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.3k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.3k
    word >>= FIRST_BIT_OFFSET;
212
213
37.3k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.3k
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi17ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.3k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.3k
    word >>= FIRST_BIT_OFFSET;
212
213
37.3k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.3k
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi18ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.3k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.3k
    word >>= FIRST_BIT_OFFSET;
212
213
37.3k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.3k
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi19ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.3k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.3k
    word >>= FIRST_BIT_OFFSET;
212
213
37.3k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.3k
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi20ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.3k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.3k
    word >>= FIRST_BIT_OFFSET;
212
213
37.3k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.3k
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi21ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.3k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.3k
    word >>= FIRST_BIT_OFFSET;
212
213
37.3k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.3k
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi22ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.3k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.3k
    word >>= FIRST_BIT_OFFSET;
212
213
37.3k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.3k
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi23ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.3k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.3k
    word >>= FIRST_BIT_OFFSET;
212
213
37.3k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.3k
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi24ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.3k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.3k
    word >>= FIRST_BIT_OFFSET;
212
213
37.3k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.3k
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi25ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.3k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.3k
    word >>= FIRST_BIT_OFFSET;
212
213
37.3k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.3k
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi26ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.3k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.3k
    word >>= FIRST_BIT_OFFSET;
212
213
37.3k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.3k
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi27ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.3k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.3k
    word >>= FIRST_BIT_OFFSET;
212
213
37.3k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.3k
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi28ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
37.3k
        uint32_t word = in[FIRST_WORD_IDX];
206
37.3k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
37.3k
        return word & mask;
208
37.3k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi29ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
37.3k
        uint32_t word = in[FIRST_WORD_IDX];
206
37.3k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
37.3k
        return word & mask;
208
37.3k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi30ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
37.3k
        uint32_t word = in[FIRST_WORD_IDX];
206
37.3k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
37.3k
        return word & mask;
208
37.3k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
37.3k
}
_ZN5doris11UnpackValueILi7ELi31ELb1EEEmPKh
Line
Count
Source
175
37.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.3k
    if (BIT_WIDTH == 0) return 0;
177
178
37.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.3k
    constexpr bool READ_32_BITS =
202
37.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.3k
    if (READ_32_BITS) {
205
37.3k
        uint32_t word = in[FIRST_WORD_IDX];
206
37.3k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
37.3k
        return word & mask;
208
37.3k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
37.3k
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi7ELi23ELb0EEEmPKh
Line
Count
Source
175
1.67k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.67k
    if (BIT_WIDTH == 0) return 0;
177
178
1.67k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.67k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.67k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.67k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.67k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.67k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.67k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.67k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.67k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.67k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.67k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.67k
    constexpr bool READ_32_BITS =
202
1.67k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.67k
    if (READ_32_BITS) {
205
1.67k
        uint32_t word = in[FIRST_WORD_IDX];
206
1.67k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.67k
        return word & mask;
208
1.67k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
1.67k
}
_ZN5doris11UnpackValueILi7ELi22ELb0EEEmPKh
Line
Count
Source
175
1.67k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.67k
    if (BIT_WIDTH == 0) return 0;
177
178
1.67k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.67k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.67k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.67k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.67k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.67k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.67k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.67k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.67k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.67k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.67k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.67k
    constexpr bool READ_32_BITS =
202
1.67k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.67k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.67k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.67k
    word >>= FIRST_BIT_OFFSET;
212
213
1.67k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.67k
    return word & mask;
220
1.67k
}
_ZN5doris11UnpackValueILi7ELi21ELb0EEEmPKh
Line
Count
Source
175
1.67k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.67k
    if (BIT_WIDTH == 0) return 0;
177
178
1.67k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.67k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.67k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.67k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.67k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.67k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.67k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.67k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.67k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.67k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.67k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.67k
    constexpr bool READ_32_BITS =
202
1.67k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.67k
    if (READ_32_BITS) {
205
1.67k
        uint32_t word = in[FIRST_WORD_IDX];
206
1.67k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.67k
        return word & mask;
208
1.67k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
1.67k
}
_ZN5doris11UnpackValueILi7ELi20ELb0EEEmPKh
Line
Count
Source
175
1.67k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.67k
    if (BIT_WIDTH == 0) return 0;
177
178
1.67k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.67k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.67k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.67k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.67k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.67k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.67k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.67k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.67k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.67k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.67k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.67k
    constexpr bool READ_32_BITS =
202
1.67k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.67k
    if (READ_32_BITS) {
205
1.67k
        uint32_t word = in[FIRST_WORD_IDX];
206
1.67k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.67k
        return word & mask;
208
1.67k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
1.67k
}
_ZN5doris11UnpackValueILi7ELi19ELb0EEEmPKh
Line
Count
Source
175
1.67k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.67k
    if (BIT_WIDTH == 0) return 0;
177
178
1.67k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.67k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.67k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.67k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.67k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.67k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.67k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.67k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.67k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.67k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.67k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.67k
    constexpr bool READ_32_BITS =
202
1.67k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.67k
    if (READ_32_BITS) {
205
1.67k
        uint32_t word = in[FIRST_WORD_IDX];
206
1.67k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.67k
        return word & mask;
208
1.67k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
1.67k
}
_ZN5doris11UnpackValueILi7ELi18ELb0EEEmPKh
Line
Count
Source
175
1.67k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.67k
    if (BIT_WIDTH == 0) return 0;
177
178
1.67k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.67k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.67k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.67k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.67k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.67k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.67k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.67k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.67k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.67k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.67k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.67k
    constexpr bool READ_32_BITS =
202
1.67k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.67k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.67k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.67k
    word >>= FIRST_BIT_OFFSET;
212
213
1.67k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.67k
    return word & mask;
220
1.67k
}
_ZN5doris11UnpackValueILi7ELi17ELb0EEEmPKh
Line
Count
Source
175
1.67k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.67k
    if (BIT_WIDTH == 0) return 0;
177
178
1.67k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.67k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.67k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.67k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.67k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.67k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.67k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.67k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.67k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.67k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.67k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.67k
    constexpr bool READ_32_BITS =
202
1.67k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.67k
    if (READ_32_BITS) {
205
1.67k
        uint32_t word = in[FIRST_WORD_IDX];
206
1.67k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.67k
        return word & mask;
208
1.67k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
1.67k
}
_ZN5doris11UnpackValueILi7ELi16ELb0EEEmPKh
Line
Count
Source
175
1.67k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.67k
    if (BIT_WIDTH == 0) return 0;
177
178
1.67k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.67k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.67k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.67k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.67k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.67k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.67k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.67k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.67k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.67k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.67k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.67k
    constexpr bool READ_32_BITS =
202
1.67k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.67k
    if (READ_32_BITS) {
205
1.67k
        uint32_t word = in[FIRST_WORD_IDX];
206
1.67k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.67k
        return word & mask;
208
1.67k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
1.67k
}
_ZN5doris11UnpackValueILi7ELi15ELb0EEEmPKh
Line
Count
Source
175
2.24k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.24k
    if (BIT_WIDTH == 0) return 0;
177
178
2.24k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.24k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.24k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.24k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.24k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.24k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.24k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.24k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.24k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.24k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.24k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.24k
    constexpr bool READ_32_BITS =
202
2.24k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.24k
    if (READ_32_BITS) {
205
2.24k
        uint32_t word = in[FIRST_WORD_IDX];
206
2.24k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
2.24k
        return word & mask;
208
2.24k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
2.24k
}
_ZN5doris11UnpackValueILi7ELi14ELb0EEEmPKh
Line
Count
Source
175
2.24k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.24k
    if (BIT_WIDTH == 0) return 0;
177
178
2.24k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.24k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.24k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.24k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.24k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.24k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.24k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.24k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.24k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.24k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.24k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.24k
    constexpr bool READ_32_BITS =
202
2.24k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.24k
    if (READ_32_BITS) {
205
2.24k
        uint32_t word = in[FIRST_WORD_IDX];
206
2.24k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
2.24k
        return word & mask;
208
2.24k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
2.24k
}
_ZN5doris11UnpackValueILi7ELi13ELb0EEEmPKh
Line
Count
Source
175
2.24k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.24k
    if (BIT_WIDTH == 0) return 0;
177
178
2.24k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.24k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.24k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.24k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.24k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.24k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.24k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.24k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.24k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.24k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.24k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.24k
    constexpr bool READ_32_BITS =
202
2.24k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.24k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.24k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.24k
    word >>= FIRST_BIT_OFFSET;
212
213
2.24k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.24k
    return word & mask;
220
2.24k
}
_ZN5doris11UnpackValueILi7ELi12ELb0EEEmPKh
Line
Count
Source
175
2.24k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.24k
    if (BIT_WIDTH == 0) return 0;
177
178
2.24k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.24k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.24k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.24k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.24k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.24k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.24k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.24k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.24k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.24k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.24k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.24k
    constexpr bool READ_32_BITS =
202
2.24k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.24k
    if (READ_32_BITS) {
205
2.24k
        uint32_t word = in[FIRST_WORD_IDX];
206
2.24k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
2.24k
        return word & mask;
208
2.24k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
2.24k
}
_ZN5doris11UnpackValueILi7ELi11ELb0EEEmPKh
Line
Count
Source
175
2.24k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.24k
    if (BIT_WIDTH == 0) return 0;
177
178
2.24k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.24k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.24k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.24k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.24k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.24k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.24k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.24k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.24k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.24k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.24k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.24k
    constexpr bool READ_32_BITS =
202
2.24k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.24k
    if (READ_32_BITS) {
205
2.24k
        uint32_t word = in[FIRST_WORD_IDX];
206
2.24k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
2.24k
        return word & mask;
208
2.24k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
2.24k
}
_ZN5doris11UnpackValueILi7ELi10ELb0EEEmPKh
Line
Count
Source
175
2.24k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.24k
    if (BIT_WIDTH == 0) return 0;
177
178
2.24k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.24k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.24k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.24k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.24k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.24k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.24k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.24k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.24k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.24k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.24k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.24k
    constexpr bool READ_32_BITS =
202
2.24k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.24k
    if (READ_32_BITS) {
205
2.24k
        uint32_t word = in[FIRST_WORD_IDX];
206
2.24k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
2.24k
        return word & mask;
208
2.24k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
2.24k
}
_ZN5doris11UnpackValueILi7ELi9ELb0EEEmPKh
Line
Count
Source
175
2.24k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.24k
    if (BIT_WIDTH == 0) return 0;
177
178
2.24k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.24k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.24k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.24k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.24k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.24k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.24k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.24k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.24k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.24k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.24k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.24k
    constexpr bool READ_32_BITS =
202
2.24k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.24k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.24k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.24k
    word >>= FIRST_BIT_OFFSET;
212
213
2.24k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.24k
    return word & mask;
220
2.24k
}
_ZN5doris11UnpackValueILi7ELi8ELb0EEEmPKh
Line
Count
Source
175
2.24k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.24k
    if (BIT_WIDTH == 0) return 0;
177
178
2.24k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.24k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.24k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.24k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.24k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.24k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.24k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.24k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.24k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.24k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.24k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.24k
    constexpr bool READ_32_BITS =
202
2.24k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.24k
    if (READ_32_BITS) {
205
2.24k
        uint32_t word = in[FIRST_WORD_IDX];
206
2.24k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
2.24k
        return word & mask;
208
2.24k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
2.24k
}
_ZN5doris11UnpackValueILi7ELi7ELb0EEEmPKh
Line
Count
Source
175
7.25k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
7.25k
    if (BIT_WIDTH == 0) return 0;
177
178
7.25k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
7.25k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
7.25k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
7.25k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
7.25k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
7.25k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
7.25k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
7.25k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
7.25k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
7.25k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
7.25k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
7.25k
    constexpr bool READ_32_BITS =
202
7.25k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
7.25k
    if (READ_32_BITS) {
205
7.25k
        uint32_t word = in[FIRST_WORD_IDX];
206
7.25k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
7.25k
        return word & mask;
208
7.25k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
7.25k
}
_ZN5doris11UnpackValueILi7ELi6ELb0EEEmPKh
Line
Count
Source
175
7.25k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
7.25k
    if (BIT_WIDTH == 0) return 0;
177
178
7.25k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
7.25k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
7.25k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
7.25k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
7.25k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
7.25k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
7.25k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
7.25k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
7.25k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
7.25k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
7.25k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
7.25k
    constexpr bool READ_32_BITS =
202
7.25k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
7.25k
    if (READ_32_BITS) {
205
7.25k
        uint32_t word = in[FIRST_WORD_IDX];
206
7.25k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
7.25k
        return word & mask;
208
7.25k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
7.25k
}
_ZN5doris11UnpackValueILi7ELi5ELb0EEEmPKh
Line
Count
Source
175
7.25k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
7.25k
    if (BIT_WIDTH == 0) return 0;
177
178
7.25k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
7.25k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
7.25k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
7.25k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
7.25k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
7.25k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
7.25k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
7.25k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
7.25k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
7.25k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
7.25k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
7.25k
    constexpr bool READ_32_BITS =
202
7.25k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
7.25k
    if (READ_32_BITS) {
205
7.25k
        uint32_t word = in[FIRST_WORD_IDX];
206
7.25k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
7.25k
        return word & mask;
208
7.25k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
7.25k
}
_ZN5doris11UnpackValueILi7ELi4ELb0EEEmPKh
Line
Count
Source
175
7.25k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
7.25k
    if (BIT_WIDTH == 0) return 0;
177
178
7.25k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
7.25k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
7.25k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
7.25k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
7.25k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
7.25k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
7.25k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
7.25k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
7.25k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
7.25k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
7.25k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
7.25k
    constexpr bool READ_32_BITS =
202
7.25k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
7.25k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
7.25k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
7.25k
    word >>= FIRST_BIT_OFFSET;
212
213
7.25k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
7.25k
    return word & mask;
220
7.25k
}
_ZN5doris11UnpackValueILi7ELi3ELb0EEEmPKh
Line
Count
Source
175
7.25k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
7.25k
    if (BIT_WIDTH == 0) return 0;
177
178
7.25k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
7.25k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
7.25k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
7.25k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
7.25k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
7.25k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
7.25k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
7.25k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
7.25k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
7.25k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
7.25k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
7.25k
    constexpr bool READ_32_BITS =
202
7.25k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
7.25k
    if (READ_32_BITS) {
205
7.25k
        uint32_t word = in[FIRST_WORD_IDX];
206
7.25k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
7.25k
        return word & mask;
208
7.25k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
7.25k
}
_ZN5doris11UnpackValueILi7ELi2ELb0EEEmPKh
Line
Count
Source
175
7.25k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
7.25k
    if (BIT_WIDTH == 0) return 0;
177
178
7.25k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
7.25k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
7.25k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
7.25k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
7.25k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
7.25k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
7.25k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
7.25k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
7.25k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
7.25k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
7.25k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
7.25k
    constexpr bool READ_32_BITS =
202
7.25k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
7.25k
    if (READ_32_BITS) {
205
7.25k
        uint32_t word = in[FIRST_WORD_IDX];
206
7.25k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
7.25k
        return word & mask;
208
7.25k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
7.25k
}
_ZN5doris11UnpackValueILi7ELi1ELb0EEEmPKh
Line
Count
Source
175
7.25k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
7.25k
    if (BIT_WIDTH == 0) return 0;
177
178
7.25k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
7.25k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
7.25k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
7.25k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
7.25k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
7.25k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
7.25k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
7.25k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
7.25k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
7.25k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
7.25k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
7.25k
    constexpr bool READ_32_BITS =
202
7.25k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
7.25k
    if (READ_32_BITS) {
205
7.25k
        uint32_t word = in[FIRST_WORD_IDX];
206
7.25k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
7.25k
        return word & mask;
208
7.25k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
7.25k
}
_ZN5doris11UnpackValueILi7ELi0ELb0EEEmPKh
Line
Count
Source
175
7.25k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
7.25k
    if (BIT_WIDTH == 0) return 0;
177
178
7.25k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
7.25k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
7.25k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
7.25k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
7.25k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
7.25k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
7.25k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
7.25k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
7.25k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
7.25k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
7.25k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
7.25k
    constexpr bool READ_32_BITS =
202
7.25k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
7.25k
    if (READ_32_BITS) {
205
7.25k
        uint32_t word = in[FIRST_WORD_IDX];
206
7.25k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
7.25k
        return word & mask;
208
7.25k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
7.25k
}
_ZN5doris11UnpackValueILi8ELi0ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi1ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi2ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi3ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi4ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi5ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi6ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi7ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi8ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi9ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi10ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi11ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi12ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi13ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi14ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi15ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi16ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi17ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi18ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi19ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi20ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi21ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi22ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi23ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi24ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi25ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi26ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi27ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi28ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi29ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi30ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
_ZN5doris11UnpackValueILi8ELi31ELb1EEEmPKh
Line
Count
Source
175
20.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
20.4k
    if (BIT_WIDTH == 0) return 0;
177
178
20.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
20.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
20.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
20.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
20.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
20.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
20.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
20.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
20.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
20.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
20.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
20.4k
    constexpr bool READ_32_BITS =
202
20.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
20.4k
    if (READ_32_BITS) {
205
20.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
20.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
20.4k
        return word & mask;
208
20.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
20.4k
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi8ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi8ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi8ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi8ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi8ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi8ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi8ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi8ELi23ELb0EEEmPKh
Line
Count
Source
175
1.54k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.54k
    if (BIT_WIDTH == 0) return 0;
177
178
1.54k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.54k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.54k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.54k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.54k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.54k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.54k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.54k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.54k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.54k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.54k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.54k
    constexpr bool READ_32_BITS =
202
1.54k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.54k
    if (READ_32_BITS) {
205
1.54k
        uint32_t word = in[FIRST_WORD_IDX];
206
1.54k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.54k
        return word & mask;
208
1.54k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
1.54k
}
_ZN5doris11UnpackValueILi8ELi22ELb0EEEmPKh
Line
Count
Source
175
1.54k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.54k
    if (BIT_WIDTH == 0) return 0;
177
178
1.54k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.54k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.54k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.54k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.54k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.54k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.54k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.54k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.54k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.54k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.54k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.54k
    constexpr bool READ_32_BITS =
202
1.54k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.54k
    if (READ_32_BITS) {
205
1.54k
        uint32_t word = in[FIRST_WORD_IDX];
206
1.54k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.54k
        return word & mask;
208
1.54k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
1.54k
}
_ZN5doris11UnpackValueILi8ELi21ELb0EEEmPKh
Line
Count
Source
175
1.54k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.54k
    if (BIT_WIDTH == 0) return 0;
177
178
1.54k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.54k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.54k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.54k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.54k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.54k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.54k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.54k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.54k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.54k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.54k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.54k
    constexpr bool READ_32_BITS =
202
1.54k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.54k
    if (READ_32_BITS) {
205
1.54k
        uint32_t word = in[FIRST_WORD_IDX];
206
1.54k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.54k
        return word & mask;
208
1.54k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
1.54k
}
_ZN5doris11UnpackValueILi8ELi20ELb0EEEmPKh
Line
Count
Source
175
1.54k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.54k
    if (BIT_WIDTH == 0) return 0;
177
178
1.54k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.54k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.54k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.54k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.54k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.54k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.54k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.54k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.54k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.54k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.54k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.54k
    constexpr bool READ_32_BITS =
202
1.54k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.54k
    if (READ_32_BITS) {
205
1.54k
        uint32_t word = in[FIRST_WORD_IDX];
206
1.54k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.54k
        return word & mask;
208
1.54k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
1.54k
}
_ZN5doris11UnpackValueILi8ELi19ELb0EEEmPKh
Line
Count
Source
175
1.54k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.54k
    if (BIT_WIDTH == 0) return 0;
177
178
1.54k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.54k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.54k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.54k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.54k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.54k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.54k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.54k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.54k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.54k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.54k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.54k
    constexpr bool READ_32_BITS =
202
1.54k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.54k
    if (READ_32_BITS) {
205
1.54k
        uint32_t word = in[FIRST_WORD_IDX];
206
1.54k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.54k
        return word & mask;
208
1.54k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
1.54k
}
_ZN5doris11UnpackValueILi8ELi18ELb0EEEmPKh
Line
Count
Source
175
1.54k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.54k
    if (BIT_WIDTH == 0) return 0;
177
178
1.54k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.54k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.54k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.54k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.54k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.54k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.54k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.54k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.54k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.54k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.54k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.54k
    constexpr bool READ_32_BITS =
202
1.54k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.54k
    if (READ_32_BITS) {
205
1.54k
        uint32_t word = in[FIRST_WORD_IDX];
206
1.54k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.54k
        return word & mask;
208
1.54k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
1.54k
}
_ZN5doris11UnpackValueILi8ELi17ELb0EEEmPKh
Line
Count
Source
175
1.54k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.54k
    if (BIT_WIDTH == 0) return 0;
177
178
1.54k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.54k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.54k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.54k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.54k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.54k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.54k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.54k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.54k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.54k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.54k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.54k
    constexpr bool READ_32_BITS =
202
1.54k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.54k
    if (READ_32_BITS) {
205
1.54k
        uint32_t word = in[FIRST_WORD_IDX];
206
1.54k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.54k
        return word & mask;
208
1.54k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
1.54k
}
_ZN5doris11UnpackValueILi8ELi16ELb0EEEmPKh
Line
Count
Source
175
1.54k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.54k
    if (BIT_WIDTH == 0) return 0;
177
178
1.54k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.54k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.54k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.54k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.54k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.54k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.54k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.54k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.54k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.54k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.54k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.54k
    constexpr bool READ_32_BITS =
202
1.54k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.54k
    if (READ_32_BITS) {
205
1.54k
        uint32_t word = in[FIRST_WORD_IDX];
206
1.54k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.54k
        return word & mask;
208
1.54k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
1.54k
}
_ZN5doris11UnpackValueILi8ELi15ELb0EEEmPKh
Line
Count
Source
175
2.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.18k
    if (BIT_WIDTH == 0) return 0;
177
178
2.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.18k
    constexpr bool READ_32_BITS =
202
2.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.18k
    if (READ_32_BITS) {
205
2.18k
        uint32_t word = in[FIRST_WORD_IDX];
206
2.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
2.18k
        return word & mask;
208
2.18k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
2.18k
}
_ZN5doris11UnpackValueILi8ELi14ELb0EEEmPKh
Line
Count
Source
175
2.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.18k
    if (BIT_WIDTH == 0) return 0;
177
178
2.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.18k
    constexpr bool READ_32_BITS =
202
2.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.18k
    if (READ_32_BITS) {
205
2.18k
        uint32_t word = in[FIRST_WORD_IDX];
206
2.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
2.18k
        return word & mask;
208
2.18k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
2.18k
}
_ZN5doris11UnpackValueILi8ELi13ELb0EEEmPKh
Line
Count
Source
175
2.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.18k
    if (BIT_WIDTH == 0) return 0;
177
178
2.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.18k
    constexpr bool READ_32_BITS =
202
2.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.18k
    if (READ_32_BITS) {
205
2.18k
        uint32_t word = in[FIRST_WORD_IDX];
206
2.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
2.18k
        return word & mask;
208
2.18k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
2.18k
}
_ZN5doris11UnpackValueILi8ELi12ELb0EEEmPKh
Line
Count
Source
175
2.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.18k
    if (BIT_WIDTH == 0) return 0;
177
178
2.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.18k
    constexpr bool READ_32_BITS =
202
2.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.18k
    if (READ_32_BITS) {
205
2.18k
        uint32_t word = in[FIRST_WORD_IDX];
206
2.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
2.18k
        return word & mask;
208
2.18k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
2.18k
}
_ZN5doris11UnpackValueILi8ELi11ELb0EEEmPKh
Line
Count
Source
175
2.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.18k
    if (BIT_WIDTH == 0) return 0;
177
178
2.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.18k
    constexpr bool READ_32_BITS =
202
2.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.18k
    if (READ_32_BITS) {
205
2.18k
        uint32_t word = in[FIRST_WORD_IDX];
206
2.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
2.18k
        return word & mask;
208
2.18k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
2.18k
}
_ZN5doris11UnpackValueILi8ELi10ELb0EEEmPKh
Line
Count
Source
175
2.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.18k
    if (BIT_WIDTH == 0) return 0;
177
178
2.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.18k
    constexpr bool READ_32_BITS =
202
2.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.18k
    if (READ_32_BITS) {
205
2.18k
        uint32_t word = in[FIRST_WORD_IDX];
206
2.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
2.18k
        return word & mask;
208
2.18k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
2.18k
}
_ZN5doris11UnpackValueILi8ELi9ELb0EEEmPKh
Line
Count
Source
175
2.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.18k
    if (BIT_WIDTH == 0) return 0;
177
178
2.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.18k
    constexpr bool READ_32_BITS =
202
2.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.18k
    if (READ_32_BITS) {
205
2.18k
        uint32_t word = in[FIRST_WORD_IDX];
206
2.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
2.18k
        return word & mask;
208
2.18k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
2.18k
}
_ZN5doris11UnpackValueILi8ELi8ELb0EEEmPKh
Line
Count
Source
175
2.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.18k
    if (BIT_WIDTH == 0) return 0;
177
178
2.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.18k
    constexpr bool READ_32_BITS =
202
2.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.18k
    if (READ_32_BITS) {
205
2.18k
        uint32_t word = in[FIRST_WORD_IDX];
206
2.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
2.18k
        return word & mask;
208
2.18k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
2.18k
}
_ZN5doris11UnpackValueILi8ELi7ELb0EEEmPKh
Line
Count
Source
175
7.80k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
7.80k
    if (BIT_WIDTH == 0) return 0;
177
178
7.80k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
7.80k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
7.80k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
7.80k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
7.80k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
7.80k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
7.80k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
7.80k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
7.80k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
7.80k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
7.80k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
7.80k
    constexpr bool READ_32_BITS =
202
7.80k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
7.80k
    if (READ_32_BITS) {
205
7.80k
        uint32_t word = in[FIRST_WORD_IDX];
206
7.80k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
7.80k
        return word & mask;
208
7.80k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
7.80k
}
_ZN5doris11UnpackValueILi8ELi6ELb0EEEmPKh
Line
Count
Source
175
7.80k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
7.80k
    if (BIT_WIDTH == 0) return 0;
177
178
7.80k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
7.80k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
7.80k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
7.80k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
7.80k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
7.80k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
7.80k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
7.80k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
7.80k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
7.80k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
7.80k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
7.80k
    constexpr bool READ_32_BITS =
202
7.80k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
7.80k
    if (READ_32_BITS) {
205
7.80k
        uint32_t word = in[FIRST_WORD_IDX];
206
7.80k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
7.80k
        return word & mask;
208
7.80k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
7.80k
}
_ZN5doris11UnpackValueILi8ELi5ELb0EEEmPKh
Line
Count
Source
175
7.80k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
7.80k
    if (BIT_WIDTH == 0) return 0;
177
178
7.80k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
7.80k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
7.80k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
7.80k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
7.80k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
7.80k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
7.80k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
7.80k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
7.80k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
7.80k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
7.80k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
7.80k
    constexpr bool READ_32_BITS =
202
7.80k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
7.80k
    if (READ_32_BITS) {
205
7.80k
        uint32_t word = in[FIRST_WORD_IDX];
206
7.80k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
7.80k
        return word & mask;
208
7.80k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
7.80k
}
_ZN5doris11UnpackValueILi8ELi4ELb0EEEmPKh
Line
Count
Source
175
7.80k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
7.80k
    if (BIT_WIDTH == 0) return 0;
177
178
7.80k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
7.80k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
7.80k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
7.80k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
7.80k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
7.80k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
7.80k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
7.80k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
7.80k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
7.80k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
7.80k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
7.80k
    constexpr bool READ_32_BITS =
202
7.80k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
7.80k
    if (READ_32_BITS) {
205
7.80k
        uint32_t word = in[FIRST_WORD_IDX];
206
7.80k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
7.80k
        return word & mask;
208
7.80k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
7.80k
}
_ZN5doris11UnpackValueILi8ELi3ELb0EEEmPKh
Line
Count
Source
175
7.80k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
7.80k
    if (BIT_WIDTH == 0) return 0;
177
178
7.80k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
7.80k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
7.80k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
7.80k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
7.80k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
7.80k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
7.80k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
7.80k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
7.80k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
7.80k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
7.80k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
7.80k
    constexpr bool READ_32_BITS =
202
7.80k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
7.80k
    if (READ_32_BITS) {
205
7.80k
        uint32_t word = in[FIRST_WORD_IDX];
206
7.80k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
7.80k
        return word & mask;
208
7.80k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
7.80k
}
_ZN5doris11UnpackValueILi8ELi2ELb0EEEmPKh
Line
Count
Source
175
7.80k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
7.80k
    if (BIT_WIDTH == 0) return 0;
177
178
7.80k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
7.80k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
7.80k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
7.80k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
7.80k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
7.80k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
7.80k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
7.80k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
7.80k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
7.80k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
7.80k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
7.80k
    constexpr bool READ_32_BITS =
202
7.80k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
7.80k
    if (READ_32_BITS) {
205
7.80k
        uint32_t word = in[FIRST_WORD_IDX];
206
7.80k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
7.80k
        return word & mask;
208
7.80k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
7.80k
}
_ZN5doris11UnpackValueILi8ELi1ELb0EEEmPKh
Line
Count
Source
175
7.80k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
7.80k
    if (BIT_WIDTH == 0) return 0;
177
178
7.80k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
7.80k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
7.80k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
7.80k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
7.80k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
7.80k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
7.80k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
7.80k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
7.80k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
7.80k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
7.80k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
7.80k
    constexpr bool READ_32_BITS =
202
7.80k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
7.80k
    if (READ_32_BITS) {
205
7.80k
        uint32_t word = in[FIRST_WORD_IDX];
206
7.80k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
7.80k
        return word & mask;
208
7.80k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
7.80k
}
_ZN5doris11UnpackValueILi8ELi0ELb0EEEmPKh
Line
Count
Source
175
7.80k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
7.80k
    if (BIT_WIDTH == 0) return 0;
177
178
7.80k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
7.80k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
7.80k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
7.80k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
7.80k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
7.80k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
7.80k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
7.80k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
7.80k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
7.80k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
7.80k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
7.80k
    constexpr bool READ_32_BITS =
202
7.80k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
7.80k
    if (READ_32_BITS) {
205
7.80k
        uint32_t word = in[FIRST_WORD_IDX];
206
7.80k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
7.80k
        return word & mask;
208
7.80k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
7.80k
}
_ZN5doris11UnpackValueILi9ELi0ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi1ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi2ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi3ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi4ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi5ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi6ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi7ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi8ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi9ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi10ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi11ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi12ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi13ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi14ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi15ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi16ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi17ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi18ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi19ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi20ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi21ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi22ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi23ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi24ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi25ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi26ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi27ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi28ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
38.1k
    word >>= FIRST_BIT_OFFSET;
212
213
38.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
38.1k
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi29ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
38.1k
        uint32_t word = in[FIRST_WORD_IDX];
206
38.1k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
38.1k
        return word & mask;
208
38.1k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi30ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
38.1k
        uint32_t word = in[FIRST_WORD_IDX];
206
38.1k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
38.1k
        return word & mask;
208
38.1k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
38.1k
}
_ZN5doris11UnpackValueILi9ELi31ELb1EEEmPKh
Line
Count
Source
175
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
38.1k
    if (BIT_WIDTH == 0) return 0;
177
178
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
38.1k
    constexpr bool READ_32_BITS =
202
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
38.1k
    if (READ_32_BITS) {
205
38.1k
        uint32_t word = in[FIRST_WORD_IDX];
206
38.1k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
38.1k
        return word & mask;
208
38.1k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
38.1k
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi9ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi9ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi9ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi9ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi9ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi9ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi9ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi9ELi23ELb0EEEmPKh
Line
Count
Source
175
2.89k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.89k
    if (BIT_WIDTH == 0) return 0;
177
178
2.89k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.89k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.89k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.89k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.89k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.89k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.89k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.89k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.89k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.89k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.89k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.89k
    constexpr bool READ_32_BITS =
202
2.89k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.89k
    if (READ_32_BITS) {
205
2.89k
        uint32_t word = in[FIRST_WORD_IDX];
206
2.89k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
2.89k
        return word & mask;
208
2.89k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
2.89k
}
_ZN5doris11UnpackValueILi9ELi22ELb0EEEmPKh
Line
Count
Source
175
2.89k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.89k
    if (BIT_WIDTH == 0) return 0;
177
178
2.89k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.89k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.89k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.89k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.89k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.89k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.89k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.89k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.89k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.89k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.89k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.89k
    constexpr bool READ_32_BITS =
202
2.89k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.89k
    if (READ_32_BITS) {
205
2.89k
        uint32_t word = in[FIRST_WORD_IDX];
206
2.89k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
2.89k
        return word & mask;
208
2.89k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
2.89k
}
_ZN5doris11UnpackValueILi9ELi21ELb0EEEmPKh
Line
Count
Source
175
2.89k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.89k
    if (BIT_WIDTH == 0) return 0;
177
178
2.89k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.89k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.89k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.89k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.89k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.89k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.89k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.89k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.89k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.89k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.89k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.89k
    constexpr bool READ_32_BITS =
202
2.89k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.89k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.89k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.89k
    word >>= FIRST_BIT_OFFSET;
212
213
2.89k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.89k
    return word & mask;
220
2.89k
}
_ZN5doris11UnpackValueILi9ELi20ELb0EEEmPKh
Line
Count
Source
175
2.89k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.89k
    if (BIT_WIDTH == 0) return 0;
177
178
2.89k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.89k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.89k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.89k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.89k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.89k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.89k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.89k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.89k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.89k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.89k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.89k
    constexpr bool READ_32_BITS =
202
2.89k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.89k
    if (READ_32_BITS) {
205
2.89k
        uint32_t word = in[FIRST_WORD_IDX];
206
2.89k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
2.89k
        return word & mask;
208
2.89k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
2.89k
}
_ZN5doris11UnpackValueILi9ELi19ELb0EEEmPKh
Line
Count
Source
175
2.89k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.89k
    if (BIT_WIDTH == 0) return 0;
177
178
2.89k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.89k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.89k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.89k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.89k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.89k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.89k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.89k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.89k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.89k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.89k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.89k
    constexpr bool READ_32_BITS =
202
2.89k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.89k
    if (READ_32_BITS) {
205
2.89k
        uint32_t word = in[FIRST_WORD_IDX];
206
2.89k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
2.89k
        return word & mask;
208
2.89k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
2.89k
}
_ZN5doris11UnpackValueILi9ELi18ELb0EEEmPKh
Line
Count
Source
175
2.89k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.89k
    if (BIT_WIDTH == 0) return 0;
177
178
2.89k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.89k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.89k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.89k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.89k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.89k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.89k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.89k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.89k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.89k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.89k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.89k
    constexpr bool READ_32_BITS =
202
2.89k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.89k
    if (READ_32_BITS) {
205
2.89k
        uint32_t word = in[FIRST_WORD_IDX];
206
2.89k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
2.89k
        return word & mask;
208
2.89k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
2.89k
}
_ZN5doris11UnpackValueILi9ELi17ELb0EEEmPKh
Line
Count
Source
175
2.89k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.89k
    if (BIT_WIDTH == 0) return 0;
177
178
2.89k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.89k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.89k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.89k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.89k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.89k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.89k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.89k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.89k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.89k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.89k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.89k
    constexpr bool READ_32_BITS =
202
2.89k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.89k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.89k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.89k
    word >>= FIRST_BIT_OFFSET;
212
213
2.89k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.89k
    return word & mask;
220
2.89k
}
_ZN5doris11UnpackValueILi9ELi16ELb0EEEmPKh
Line
Count
Source
175
2.89k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.89k
    if (BIT_WIDTH == 0) return 0;
177
178
2.89k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.89k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.89k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.89k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.89k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.89k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.89k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.89k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.89k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.89k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.89k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.89k
    constexpr bool READ_32_BITS =
202
2.89k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.89k
    if (READ_32_BITS) {
205
2.89k
        uint32_t word = in[FIRST_WORD_IDX];
206
2.89k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
2.89k
        return word & mask;
208
2.89k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
2.89k
}
_ZN5doris11UnpackValueILi9ELi15ELb0EEEmPKh
Line
Count
Source
175
3.97k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.97k
    if (BIT_WIDTH == 0) return 0;
177
178
3.97k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.97k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.97k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.97k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.97k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.97k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.97k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.97k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.97k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.97k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.97k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.97k
    constexpr bool READ_32_BITS =
202
3.97k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.97k
    if (READ_32_BITS) {
205
3.97k
        uint32_t word = in[FIRST_WORD_IDX];
206
3.97k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
3.97k
        return word & mask;
208
3.97k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
3.97k
}
_ZN5doris11UnpackValueILi9ELi14ELb0EEEmPKh
Line
Count
Source
175
3.97k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.97k
    if (BIT_WIDTH == 0) return 0;
177
178
3.97k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.97k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.97k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.97k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.97k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.97k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.97k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.97k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.97k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.97k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.97k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.97k
    constexpr bool READ_32_BITS =
202
3.97k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.97k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.97k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.97k
    word >>= FIRST_BIT_OFFSET;
212
213
3.97k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.97k
    return word & mask;
220
3.97k
}
_ZN5doris11UnpackValueILi9ELi13ELb0EEEmPKh
Line
Count
Source
175
3.97k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.97k
    if (BIT_WIDTH == 0) return 0;
177
178
3.97k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.97k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.97k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.97k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.97k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.97k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.97k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.97k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.97k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.97k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.97k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.97k
    constexpr bool READ_32_BITS =
202
3.97k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.97k
    if (READ_32_BITS) {
205
3.97k
        uint32_t word = in[FIRST_WORD_IDX];
206
3.97k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
3.97k
        return word & mask;
208
3.97k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
3.97k
}
_ZN5doris11UnpackValueILi9ELi12ELb0EEEmPKh
Line
Count
Source
175
3.97k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.97k
    if (BIT_WIDTH == 0) return 0;
177
178
3.97k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.97k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.97k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.97k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.97k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.97k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.97k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.97k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.97k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.97k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.97k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.97k
    constexpr bool READ_32_BITS =
202
3.97k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.97k
    if (READ_32_BITS) {
205
3.97k
        uint32_t word = in[FIRST_WORD_IDX];
206
3.97k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
3.97k
        return word & mask;
208
3.97k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
3.97k
}
_ZN5doris11UnpackValueILi9ELi11ELb0EEEmPKh
Line
Count
Source
175
3.97k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.97k
    if (BIT_WIDTH == 0) return 0;
177
178
3.97k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.97k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.97k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.97k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.97k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.97k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.97k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.97k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.97k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.97k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.97k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.97k
    constexpr bool READ_32_BITS =
202
3.97k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.97k
    if (READ_32_BITS) {
205
3.97k
        uint32_t word = in[FIRST_WORD_IDX];
206
3.97k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
3.97k
        return word & mask;
208
3.97k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
3.97k
}
_ZN5doris11UnpackValueILi9ELi10ELb0EEEmPKh
Line
Count
Source
175
3.97k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.97k
    if (BIT_WIDTH == 0) return 0;
177
178
3.97k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.97k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.97k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.97k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.97k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.97k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.97k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.97k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.97k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.97k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.97k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.97k
    constexpr bool READ_32_BITS =
202
3.97k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.97k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.97k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.97k
    word >>= FIRST_BIT_OFFSET;
212
213
3.97k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.97k
    return word & mask;
220
3.97k
}
_ZN5doris11UnpackValueILi9ELi9ELb0EEEmPKh
Line
Count
Source
175
3.97k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.97k
    if (BIT_WIDTH == 0) return 0;
177
178
3.97k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.97k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.97k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.97k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.97k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.97k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.97k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.97k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.97k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.97k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.97k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.97k
    constexpr bool READ_32_BITS =
202
3.97k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.97k
    if (READ_32_BITS) {
205
3.97k
        uint32_t word = in[FIRST_WORD_IDX];
206
3.97k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
3.97k
        return word & mask;
208
3.97k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
3.97k
}
_ZN5doris11UnpackValueILi9ELi8ELb0EEEmPKh
Line
Count
Source
175
3.97k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.97k
    if (BIT_WIDTH == 0) return 0;
177
178
3.97k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.97k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.97k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.97k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.97k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.97k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.97k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.97k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.97k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.97k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.97k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.97k
    constexpr bool READ_32_BITS =
202
3.97k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.97k
    if (READ_32_BITS) {
205
3.97k
        uint32_t word = in[FIRST_WORD_IDX];
206
3.97k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
3.97k
        return word & mask;
208
3.97k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
3.97k
}
_ZN5doris11UnpackValueILi9ELi7ELb0EEEmPKh
Line
Count
Source
175
13.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
13.1k
    if (BIT_WIDTH == 0) return 0;
177
178
13.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
13.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
13.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
13.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
13.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
13.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
13.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
13.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
13.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
13.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
13.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
13.1k
    constexpr bool READ_32_BITS =
202
13.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
13.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
13.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
13.1k
    word >>= FIRST_BIT_OFFSET;
212
213
13.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
13.1k
    return word & mask;
220
13.1k
}
_ZN5doris11UnpackValueILi9ELi6ELb0EEEmPKh
Line
Count
Source
175
13.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
13.1k
    if (BIT_WIDTH == 0) return 0;
177
178
13.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
13.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
13.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
13.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
13.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
13.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
13.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
13.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
13.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
13.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
13.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
13.1k
    constexpr bool READ_32_BITS =
202
13.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
13.1k
    if (READ_32_BITS) {
205
13.1k
        uint32_t word = in[FIRST_WORD_IDX];
206
13.1k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
13.1k
        return word & mask;
208
13.1k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
13.1k
}
_ZN5doris11UnpackValueILi9ELi5ELb0EEEmPKh
Line
Count
Source
175
13.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
13.1k
    if (BIT_WIDTH == 0) return 0;
177
178
13.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
13.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
13.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
13.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
13.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
13.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
13.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
13.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
13.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
13.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
13.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
13.1k
    constexpr bool READ_32_BITS =
202
13.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
13.1k
    if (READ_32_BITS) {
205
13.1k
        uint32_t word = in[FIRST_WORD_IDX];
206
13.1k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
13.1k
        return word & mask;
208
13.1k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
13.1k
}
_ZN5doris11UnpackValueILi9ELi4ELb0EEEmPKh
Line
Count
Source
175
13.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
13.1k
    if (BIT_WIDTH == 0) return 0;
177
178
13.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
13.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
13.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
13.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
13.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
13.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
13.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
13.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
13.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
13.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
13.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
13.1k
    constexpr bool READ_32_BITS =
202
13.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
13.1k
    if (READ_32_BITS) {
205
13.1k
        uint32_t word = in[FIRST_WORD_IDX];
206
13.1k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
13.1k
        return word & mask;
208
13.1k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
13.1k
}
_ZN5doris11UnpackValueILi9ELi3ELb0EEEmPKh
Line
Count
Source
175
13.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
13.1k
    if (BIT_WIDTH == 0) return 0;
177
178
13.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
13.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
13.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
13.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
13.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
13.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
13.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
13.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
13.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
13.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
13.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
13.1k
    constexpr bool READ_32_BITS =
202
13.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
13.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
13.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
13.1k
    word >>= FIRST_BIT_OFFSET;
212
213
13.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
13.1k
    return word & mask;
220
13.1k
}
_ZN5doris11UnpackValueILi9ELi2ELb0EEEmPKh
Line
Count
Source
175
13.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
13.1k
    if (BIT_WIDTH == 0) return 0;
177
178
13.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
13.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
13.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
13.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
13.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
13.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
13.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
13.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
13.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
13.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
13.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
13.1k
    constexpr bool READ_32_BITS =
202
13.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
13.1k
    if (READ_32_BITS) {
205
13.1k
        uint32_t word = in[FIRST_WORD_IDX];
206
13.1k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
13.1k
        return word & mask;
208
13.1k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
13.1k
}
_ZN5doris11UnpackValueILi9ELi1ELb0EEEmPKh
Line
Count
Source
175
13.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
13.1k
    if (BIT_WIDTH == 0) return 0;
177
178
13.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
13.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
13.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
13.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
13.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
13.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
13.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
13.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
13.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
13.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
13.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
13.1k
    constexpr bool READ_32_BITS =
202
13.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
13.1k
    if (READ_32_BITS) {
205
13.1k
        uint32_t word = in[FIRST_WORD_IDX];
206
13.1k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
13.1k
        return word & mask;
208
13.1k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
13.1k
}
_ZN5doris11UnpackValueILi9ELi0ELb0EEEmPKh
Line
Count
Source
175
13.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
13.1k
    if (BIT_WIDTH == 0) return 0;
177
178
13.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
13.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
13.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
13.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
13.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
13.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
13.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
13.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
13.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
13.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
13.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
13.1k
    constexpr bool READ_32_BITS =
202
13.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
13.1k
    if (READ_32_BITS) {
205
13.1k
        uint32_t word = in[FIRST_WORD_IDX];
206
13.1k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
13.1k
        return word & mask;
208
13.1k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
13.1k
}
_ZN5doris11UnpackValueILi10ELi0ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi1ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi2ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi3ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi4ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi5ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi6ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi7ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi8ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi9ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi10ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi11ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi12ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi13ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi14ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi15ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi16ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi17ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi18ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi19ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi20ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi21ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi22ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi23ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi24ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi25ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi26ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi27ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi28ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
36.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
36.4k
    word >>= FIRST_BIT_OFFSET;
212
213
36.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
36.4k
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi29ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
36.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
36.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
36.4k
        return word & mask;
208
36.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi30ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
36.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
36.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
36.4k
        return word & mask;
208
36.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
36.4k
}
_ZN5doris11UnpackValueILi10ELi31ELb1EEEmPKh
Line
Count
Source
175
36.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
36.4k
    if (BIT_WIDTH == 0) return 0;
177
178
36.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
36.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
36.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
36.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
36.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
36.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
36.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
36.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
36.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
36.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
36.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
36.4k
    constexpr bool READ_32_BITS =
202
36.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
36.4k
    if (READ_32_BITS) {
205
36.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
36.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
36.4k
        return word & mask;
208
36.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
36.4k
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi10ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi10ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi10ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi10ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi10ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi10ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi10ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi10ELi23ELb0EEEmPKh
Line
Count
Source
175
2.86k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.86k
    if (BIT_WIDTH == 0) return 0;
177
178
2.86k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.86k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.86k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.86k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.86k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.86k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.86k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.86k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.86k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.86k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.86k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.86k
    constexpr bool READ_32_BITS =
202
2.86k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.86k
    if (READ_32_BITS) {
205
2.86k
        uint32_t word = in[FIRST_WORD_IDX];
206
2.86k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
2.86k
        return word & mask;
208
2.86k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
2.86k
}
_ZN5doris11UnpackValueILi10ELi22ELb0EEEmPKh
Line
Count
Source
175
2.86k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.86k
    if (BIT_WIDTH == 0) return 0;
177
178
2.86k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.86k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.86k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.86k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.86k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.86k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.86k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.86k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.86k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.86k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.86k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.86k
    constexpr bool READ_32_BITS =
202
2.86k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.86k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.86k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.86k
    word >>= FIRST_BIT_OFFSET;
212
213
2.86k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.86k
    return word & mask;
220
2.86k
}
_ZN5doris11UnpackValueILi10ELi21ELb0EEEmPKh
Line
Count
Source
175
2.86k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.86k
    if (BIT_WIDTH == 0) return 0;
177
178
2.86k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.86k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.86k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.86k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.86k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.86k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.86k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.86k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.86k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.86k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.86k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.86k
    constexpr bool READ_32_BITS =
202
2.86k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.86k
    if (READ_32_BITS) {
205
2.86k
        uint32_t word = in[FIRST_WORD_IDX];
206
2.86k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
2.86k
        return word & mask;
208
2.86k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
2.86k
}
_ZN5doris11UnpackValueILi10ELi20ELb0EEEmPKh
Line
Count
Source
175
2.86k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.86k
    if (BIT_WIDTH == 0) return 0;
177
178
2.86k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.86k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.86k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.86k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.86k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.86k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.86k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.86k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.86k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.86k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.86k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.86k
    constexpr bool READ_32_BITS =
202
2.86k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.86k
    if (READ_32_BITS) {
205
2.86k
        uint32_t word = in[FIRST_WORD_IDX];
206
2.86k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
2.86k
        return word & mask;
208
2.86k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
2.86k
}
_ZN5doris11UnpackValueILi10ELi19ELb0EEEmPKh
Line
Count
Source
175
2.86k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.86k
    if (BIT_WIDTH == 0) return 0;
177
178
2.86k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.86k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.86k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.86k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.86k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.86k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.86k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.86k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.86k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.86k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.86k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.86k
    constexpr bool READ_32_BITS =
202
2.86k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.86k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.86k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.86k
    word >>= FIRST_BIT_OFFSET;
212
213
2.86k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.86k
    return word & mask;
220
2.86k
}
_ZN5doris11UnpackValueILi10ELi18ELb0EEEmPKh
Line
Count
Source
175
2.86k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.86k
    if (BIT_WIDTH == 0) return 0;
177
178
2.86k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.86k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.86k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.86k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.86k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.86k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.86k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.86k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.86k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.86k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.86k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.86k
    constexpr bool READ_32_BITS =
202
2.86k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.86k
    if (READ_32_BITS) {
205
2.86k
        uint32_t word = in[FIRST_WORD_IDX];
206
2.86k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
2.86k
        return word & mask;
208
2.86k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
2.86k
}
_ZN5doris11UnpackValueILi10ELi17ELb0EEEmPKh
Line
Count
Source
175
2.86k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.86k
    if (BIT_WIDTH == 0) return 0;
177
178
2.86k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.86k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.86k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.86k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.86k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.86k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.86k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.86k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.86k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.86k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.86k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.86k
    constexpr bool READ_32_BITS =
202
2.86k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.86k
    if (READ_32_BITS) {
205
2.86k
        uint32_t word = in[FIRST_WORD_IDX];
206
2.86k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
2.86k
        return word & mask;
208
2.86k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
2.86k
}
_ZN5doris11UnpackValueILi10ELi16ELb0EEEmPKh
Line
Count
Source
175
2.86k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.86k
    if (BIT_WIDTH == 0) return 0;
177
178
2.86k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.86k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.86k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.86k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.86k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.86k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.86k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.86k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.86k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.86k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.86k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.86k
    constexpr bool READ_32_BITS =
202
2.86k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.86k
    if (READ_32_BITS) {
205
2.86k
        uint32_t word = in[FIRST_WORD_IDX];
206
2.86k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
2.86k
        return word & mask;
208
2.86k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
2.86k
}
_ZN5doris11UnpackValueILi10ELi15ELb0EEEmPKh
Line
Count
Source
175
3.31k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.31k
    if (BIT_WIDTH == 0) return 0;
177
178
3.31k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.31k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.31k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.31k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.31k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.31k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.31k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.31k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.31k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.31k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.31k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.31k
    constexpr bool READ_32_BITS =
202
3.31k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.31k
    if (READ_32_BITS) {
205
3.31k
        uint32_t word = in[FIRST_WORD_IDX];
206
3.31k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
3.31k
        return word & mask;
208
3.31k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
3.31k
}
_ZN5doris11UnpackValueILi10ELi14ELb0EEEmPKh
Line
Count
Source
175
3.31k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.31k
    if (BIT_WIDTH == 0) return 0;
177
178
3.31k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.31k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.31k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.31k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.31k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.31k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.31k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.31k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.31k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.31k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.31k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.31k
    constexpr bool READ_32_BITS =
202
3.31k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.31k
    if (READ_32_BITS) {
205
3.31k
        uint32_t word = in[FIRST_WORD_IDX];
206
3.31k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
3.31k
        return word & mask;
208
3.31k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
3.31k
}
_ZN5doris11UnpackValueILi10ELi13ELb0EEEmPKh
Line
Count
Source
175
3.31k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.31k
    if (BIT_WIDTH == 0) return 0;
177
178
3.31k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.31k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.31k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.31k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.31k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.31k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.31k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.31k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.31k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.31k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.31k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.31k
    constexpr bool READ_32_BITS =
202
3.31k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.31k
    if (READ_32_BITS) {
205
3.31k
        uint32_t word = in[FIRST_WORD_IDX];
206
3.31k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
3.31k
        return word & mask;
208
3.31k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
3.31k
}
_ZN5doris11UnpackValueILi10ELi12ELb0EEEmPKh
Line
Count
Source
175
3.31k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.31k
    if (BIT_WIDTH == 0) return 0;
177
178
3.31k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.31k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.31k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.31k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.31k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.31k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.31k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.31k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.31k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.31k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.31k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.31k
    constexpr bool READ_32_BITS =
202
3.31k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.31k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.31k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.31k
    word >>= FIRST_BIT_OFFSET;
212
213
3.31k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.31k
    return word & mask;
220
3.31k
}
_ZN5doris11UnpackValueILi10ELi11ELb0EEEmPKh
Line
Count
Source
175
3.31k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.31k
    if (BIT_WIDTH == 0) return 0;
177
178
3.31k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.31k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.31k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.31k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.31k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.31k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.31k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.31k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.31k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.31k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.31k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.31k
    constexpr bool READ_32_BITS =
202
3.31k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.31k
    if (READ_32_BITS) {
205
3.31k
        uint32_t word = in[FIRST_WORD_IDX];
206
3.31k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
3.31k
        return word & mask;
208
3.31k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
3.31k
}
_ZN5doris11UnpackValueILi10ELi10ELb0EEEmPKh
Line
Count
Source
175
3.31k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.31k
    if (BIT_WIDTH == 0) return 0;
177
178
3.31k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.31k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.31k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.31k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.31k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.31k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.31k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.31k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.31k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.31k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.31k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.31k
    constexpr bool READ_32_BITS =
202
3.31k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.31k
    if (READ_32_BITS) {
205
3.31k
        uint32_t word = in[FIRST_WORD_IDX];
206
3.31k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
3.31k
        return word & mask;
208
3.31k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
3.31k
}
_ZN5doris11UnpackValueILi10ELi9ELb0EEEmPKh
Line
Count
Source
175
3.31k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.31k
    if (BIT_WIDTH == 0) return 0;
177
178
3.31k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.31k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.31k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.31k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.31k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.31k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.31k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.31k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.31k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.31k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.31k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.31k
    constexpr bool READ_32_BITS =
202
3.31k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.31k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.31k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.31k
    word >>= FIRST_BIT_OFFSET;
212
213
3.31k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.31k
    return word & mask;
220
3.31k
}
_ZN5doris11UnpackValueILi10ELi8ELb0EEEmPKh
Line
Count
Source
175
3.31k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.31k
    if (BIT_WIDTH == 0) return 0;
177
178
3.31k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.31k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.31k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.31k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.31k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.31k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.31k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.31k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.31k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.31k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.31k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.31k
    constexpr bool READ_32_BITS =
202
3.31k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.31k
    if (READ_32_BITS) {
205
3.31k
        uint32_t word = in[FIRST_WORD_IDX];
206
3.31k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
3.31k
        return word & mask;
208
3.31k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
3.31k
}
_ZN5doris11UnpackValueILi10ELi7ELb0EEEmPKh
Line
Count
Source
175
11.5k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.5k
    if (BIT_WIDTH == 0) return 0;
177
178
11.5k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.5k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.5k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.5k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.5k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.5k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.5k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.5k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.5k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.5k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.5k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.5k
    constexpr bool READ_32_BITS =
202
11.5k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.5k
    if (READ_32_BITS) {
205
11.5k
        uint32_t word = in[FIRST_WORD_IDX];
206
11.5k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
11.5k
        return word & mask;
208
11.5k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
11.5k
}
_ZN5doris11UnpackValueILi10ELi6ELb0EEEmPKh
Line
Count
Source
175
11.5k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.5k
    if (BIT_WIDTH == 0) return 0;
177
178
11.5k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.5k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.5k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.5k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.5k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.5k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.5k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.5k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.5k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.5k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.5k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.5k
    constexpr bool READ_32_BITS =
202
11.5k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.5k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
11.5k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
11.5k
    word >>= FIRST_BIT_OFFSET;
212
213
11.5k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
11.5k
    return word & mask;
220
11.5k
}
_ZN5doris11UnpackValueILi10ELi5ELb0EEEmPKh
Line
Count
Source
175
11.5k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.5k
    if (BIT_WIDTH == 0) return 0;
177
178
11.5k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.5k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.5k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.5k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.5k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.5k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.5k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.5k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.5k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.5k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.5k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.5k
    constexpr bool READ_32_BITS =
202
11.5k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.5k
    if (READ_32_BITS) {
205
11.5k
        uint32_t word = in[FIRST_WORD_IDX];
206
11.5k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
11.5k
        return word & mask;
208
11.5k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
11.5k
}
_ZN5doris11UnpackValueILi10ELi4ELb0EEEmPKh
Line
Count
Source
175
11.5k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.5k
    if (BIT_WIDTH == 0) return 0;
177
178
11.5k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.5k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.5k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.5k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.5k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.5k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.5k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.5k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.5k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.5k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.5k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.5k
    constexpr bool READ_32_BITS =
202
11.5k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.5k
    if (READ_32_BITS) {
205
11.5k
        uint32_t word = in[FIRST_WORD_IDX];
206
11.5k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
11.5k
        return word & mask;
208
11.5k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
11.5k
}
_ZN5doris11UnpackValueILi10ELi3ELb0EEEmPKh
Line
Count
Source
175
11.5k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.5k
    if (BIT_WIDTH == 0) return 0;
177
178
11.5k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.5k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.5k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.5k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.5k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.5k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.5k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.5k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.5k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.5k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.5k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.5k
    constexpr bool READ_32_BITS =
202
11.5k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.5k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
11.5k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
11.5k
    word >>= FIRST_BIT_OFFSET;
212
213
11.5k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
11.5k
    return word & mask;
220
11.5k
}
_ZN5doris11UnpackValueILi10ELi2ELb0EEEmPKh
Line
Count
Source
175
11.5k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.5k
    if (BIT_WIDTH == 0) return 0;
177
178
11.5k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.5k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.5k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.5k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.5k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.5k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.5k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.5k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.5k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.5k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.5k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.5k
    constexpr bool READ_32_BITS =
202
11.5k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.5k
    if (READ_32_BITS) {
205
11.5k
        uint32_t word = in[FIRST_WORD_IDX];
206
11.5k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
11.5k
        return word & mask;
208
11.5k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
11.5k
}
_ZN5doris11UnpackValueILi10ELi1ELb0EEEmPKh
Line
Count
Source
175
11.5k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.5k
    if (BIT_WIDTH == 0) return 0;
177
178
11.5k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.5k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.5k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.5k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.5k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.5k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.5k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.5k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.5k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.5k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.5k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.5k
    constexpr bool READ_32_BITS =
202
11.5k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.5k
    if (READ_32_BITS) {
205
11.5k
        uint32_t word = in[FIRST_WORD_IDX];
206
11.5k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
11.5k
        return word & mask;
208
11.5k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
11.5k
}
_ZN5doris11UnpackValueILi10ELi0ELb0EEEmPKh
Line
Count
Source
175
11.5k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
11.5k
    if (BIT_WIDTH == 0) return 0;
177
178
11.5k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
11.5k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
11.5k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
11.5k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
11.5k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
11.5k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
11.5k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
11.5k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
11.5k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
11.5k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
11.5k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
11.5k
    constexpr bool READ_32_BITS =
202
11.5k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
11.5k
    if (READ_32_BITS) {
205
11.5k
        uint32_t word = in[FIRST_WORD_IDX];
206
11.5k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
11.5k
        return word & mask;
208
11.5k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
11.5k
}
_ZN5doris11UnpackValueILi11ELi0ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi1ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi2ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi3ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi4ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi5ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi6ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi7ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi8ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi9ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi10ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi11ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi12ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi13ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi14ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi15ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi16ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi17ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi18ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi19ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi20ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi21ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi22ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi23ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi24ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi25ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi26ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi27ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi28ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi29ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
518k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
518k
    word >>= FIRST_BIT_OFFSET;
212
213
518k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
518k
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi30ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
518k
        uint32_t word = in[FIRST_WORD_IDX];
206
518k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
518k
        return word & mask;
208
518k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
518k
}
_ZN5doris11UnpackValueILi11ELi31ELb1EEEmPKh
Line
Count
Source
175
518k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
518k
    if (BIT_WIDTH == 0) return 0;
177
178
518k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
518k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
518k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
518k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
518k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
518k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
518k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
518k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
518k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
518k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
518k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
518k
    constexpr bool READ_32_BITS =
202
518k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
518k
    if (READ_32_BITS) {
205
518k
        uint32_t word = in[FIRST_WORD_IDX];
206
518k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
518k
        return word & mask;
208
518k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
518k
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi11ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi11ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi11ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi11ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi11ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi11ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi11ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi11ELi23ELb0EEEmPKh
Line
Count
Source
175
35.5k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
35.5k
    if (BIT_WIDTH == 0) return 0;
177
178
35.5k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
35.5k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
35.5k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
35.5k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
35.5k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
35.5k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
35.5k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
35.5k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
35.5k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
35.5k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
35.5k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
35.5k
    constexpr bool READ_32_BITS =
202
35.5k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
35.5k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
35.5k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
35.5k
    word >>= FIRST_BIT_OFFSET;
212
213
35.5k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
35.5k
    return word & mask;
220
35.5k
}
_ZN5doris11UnpackValueILi11ELi22ELb0EEEmPKh
Line
Count
Source
175
35.5k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
35.5k
    if (BIT_WIDTH == 0) return 0;
177
178
35.5k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
35.5k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
35.5k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
35.5k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
35.5k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
35.5k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
35.5k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
35.5k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
35.5k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
35.5k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
35.5k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
35.5k
    constexpr bool READ_32_BITS =
202
35.5k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
35.5k
    if (READ_32_BITS) {
205
35.5k
        uint32_t word = in[FIRST_WORD_IDX];
206
35.5k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
35.5k
        return word & mask;
208
35.5k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
35.5k
}
_ZN5doris11UnpackValueILi11ELi21ELb0EEEmPKh
Line
Count
Source
175
35.5k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
35.5k
    if (BIT_WIDTH == 0) return 0;
177
178
35.5k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
35.5k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
35.5k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
35.5k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
35.5k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
35.5k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
35.5k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
35.5k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
35.5k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
35.5k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
35.5k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
35.5k
    constexpr bool READ_32_BITS =
202
35.5k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
35.5k
    if (READ_32_BITS) {
205
35.5k
        uint32_t word = in[FIRST_WORD_IDX];
206
35.5k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
35.5k
        return word & mask;
208
35.5k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
35.5k
}
_ZN5doris11UnpackValueILi11ELi20ELb0EEEmPKh
Line
Count
Source
175
35.5k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
35.5k
    if (BIT_WIDTH == 0) return 0;
177
178
35.5k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
35.5k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
35.5k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
35.5k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
35.5k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
35.5k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
35.5k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
35.5k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
35.5k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
35.5k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
35.5k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
35.5k
    constexpr bool READ_32_BITS =
202
35.5k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
35.5k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
35.5k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
35.5k
    word >>= FIRST_BIT_OFFSET;
212
213
35.5k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
35.5k
    return word & mask;
220
35.5k
}
_ZN5doris11UnpackValueILi11ELi19ELb0EEEmPKh
Line
Count
Source
175
35.5k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
35.5k
    if (BIT_WIDTH == 0) return 0;
177
178
35.5k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
35.5k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
35.5k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
35.5k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
35.5k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
35.5k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
35.5k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
35.5k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
35.5k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
35.5k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
35.5k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
35.5k
    constexpr bool READ_32_BITS =
202
35.5k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
35.5k
    if (READ_32_BITS) {
205
35.5k
        uint32_t word = in[FIRST_WORD_IDX];
206
35.5k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
35.5k
        return word & mask;
208
35.5k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
35.5k
}
_ZN5doris11UnpackValueILi11ELi18ELb0EEEmPKh
Line
Count
Source
175
35.5k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
35.5k
    if (BIT_WIDTH == 0) return 0;
177
178
35.5k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
35.5k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
35.5k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
35.5k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
35.5k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
35.5k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
35.5k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
35.5k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
35.5k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
35.5k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
35.5k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
35.5k
    constexpr bool READ_32_BITS =
202
35.5k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
35.5k
    if (READ_32_BITS) {
205
35.5k
        uint32_t word = in[FIRST_WORD_IDX];
206
35.5k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
35.5k
        return word & mask;
208
35.5k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
35.5k
}
_ZN5doris11UnpackValueILi11ELi17ELb0EEEmPKh
Line
Count
Source
175
35.5k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
35.5k
    if (BIT_WIDTH == 0) return 0;
177
178
35.5k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
35.5k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
35.5k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
35.5k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
35.5k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
35.5k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
35.5k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
35.5k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
35.5k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
35.5k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
35.5k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
35.5k
    constexpr bool READ_32_BITS =
202
35.5k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
35.5k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
35.5k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
35.5k
    word >>= FIRST_BIT_OFFSET;
212
213
35.5k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
35.5k
    return word & mask;
220
35.5k
}
_ZN5doris11UnpackValueILi11ELi16ELb0EEEmPKh
Line
Count
Source
175
35.5k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
35.5k
    if (BIT_WIDTH == 0) return 0;
177
178
35.5k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
35.5k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
35.5k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
35.5k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
35.5k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
35.5k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
35.5k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
35.5k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
35.5k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
35.5k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
35.5k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
35.5k
    constexpr bool READ_32_BITS =
202
35.5k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
35.5k
    if (READ_32_BITS) {
205
35.5k
        uint32_t word = in[FIRST_WORD_IDX];
206
35.5k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
35.5k
        return word & mask;
208
35.5k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
35.5k
}
_ZN5doris11UnpackValueILi11ELi15ELb0EEEmPKh
Line
Count
Source
175
35.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
35.7k
    if (BIT_WIDTH == 0) return 0;
177
178
35.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
35.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
35.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
35.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
35.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
35.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
35.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
35.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
35.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
35.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
35.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
35.7k
    constexpr bool READ_32_BITS =
202
35.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
35.7k
    if (READ_32_BITS) {
205
35.7k
        uint32_t word = in[FIRST_WORD_IDX];
206
35.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
35.7k
        return word & mask;
208
35.7k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
35.7k
}
_ZN5doris11UnpackValueILi11ELi14ELb0EEEmPKh
Line
Count
Source
175
35.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
35.7k
    if (BIT_WIDTH == 0) return 0;
177
178
35.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
35.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
35.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
35.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
35.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
35.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
35.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
35.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
35.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
35.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
35.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
35.7k
    constexpr bool READ_32_BITS =
202
35.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
35.7k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
35.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
35.7k
    word >>= FIRST_BIT_OFFSET;
212
213
35.7k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
35.7k
    return word & mask;
220
35.7k
}
_ZN5doris11UnpackValueILi11ELi13ELb0EEEmPKh
Line
Count
Source
175
35.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
35.7k
    if (BIT_WIDTH == 0) return 0;
177
178
35.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
35.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
35.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
35.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
35.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
35.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
35.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
35.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
35.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
35.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
35.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
35.7k
    constexpr bool READ_32_BITS =
202
35.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
35.7k
    if (READ_32_BITS) {
205
35.7k
        uint32_t word = in[FIRST_WORD_IDX];
206
35.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
35.7k
        return word & mask;
208
35.7k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
35.7k
}
_ZN5doris11UnpackValueILi11ELi12ELb0EEEmPKh
Line
Count
Source
175
35.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
35.7k
    if (BIT_WIDTH == 0) return 0;
177
178
35.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
35.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
35.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
35.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
35.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
35.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
35.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
35.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
35.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
35.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
35.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
35.7k
    constexpr bool READ_32_BITS =
202
35.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
35.7k
    if (READ_32_BITS) {
205
35.7k
        uint32_t word = in[FIRST_WORD_IDX];
206
35.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
35.7k
        return word & mask;
208
35.7k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
35.7k
}
_ZN5doris11UnpackValueILi11ELi11ELb0EEEmPKh
Line
Count
Source
175
35.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
35.7k
    if (BIT_WIDTH == 0) return 0;
177
178
35.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
35.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
35.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
35.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
35.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
35.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
35.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
35.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
35.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
35.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
35.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
35.7k
    constexpr bool READ_32_BITS =
202
35.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
35.7k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
35.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
35.7k
    word >>= FIRST_BIT_OFFSET;
212
213
35.7k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
35.7k
    return word & mask;
220
35.7k
}
_ZN5doris11UnpackValueILi11ELi10ELb0EEEmPKh
Line
Count
Source
175
35.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
35.7k
    if (BIT_WIDTH == 0) return 0;
177
178
35.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
35.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
35.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
35.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
35.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
35.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
35.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
35.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
35.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
35.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
35.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
35.7k
    constexpr bool READ_32_BITS =
202
35.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
35.7k
    if (READ_32_BITS) {
205
35.7k
        uint32_t word = in[FIRST_WORD_IDX];
206
35.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
35.7k
        return word & mask;
208
35.7k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
35.7k
}
_ZN5doris11UnpackValueILi11ELi9ELb0EEEmPKh
Line
Count
Source
175
35.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
35.7k
    if (BIT_WIDTH == 0) return 0;
177
178
35.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
35.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
35.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
35.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
35.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
35.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
35.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
35.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
35.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
35.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
35.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
35.7k
    constexpr bool READ_32_BITS =
202
35.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
35.7k
    if (READ_32_BITS) {
205
35.7k
        uint32_t word = in[FIRST_WORD_IDX];
206
35.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
35.7k
        return word & mask;
208
35.7k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
35.7k
}
_ZN5doris11UnpackValueILi11ELi8ELb0EEEmPKh
Line
Count
Source
175
35.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
35.7k
    if (BIT_WIDTH == 0) return 0;
177
178
35.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
35.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
35.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
35.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
35.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
35.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
35.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
35.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
35.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
35.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
35.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
35.7k
    constexpr bool READ_32_BITS =
202
35.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
35.7k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
35.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
35.7k
    word >>= FIRST_BIT_OFFSET;
212
213
35.7k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
35.7k
    return word & mask;
220
35.7k
}
_ZN5doris11UnpackValueILi11ELi7ELb0EEEmPKh
Line
Count
Source
175
40.0k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
40.0k
    if (BIT_WIDTH == 0) return 0;
177
178
40.0k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
40.0k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
40.0k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
40.0k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
40.0k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
40.0k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
40.0k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
40.0k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
40.0k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
40.0k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
40.0k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
40.0k
    constexpr bool READ_32_BITS =
202
40.0k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
40.0k
    if (READ_32_BITS) {
205
40.0k
        uint32_t word = in[FIRST_WORD_IDX];
206
40.0k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
40.0k
        return word & mask;
208
40.0k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
40.0k
}
_ZN5doris11UnpackValueILi11ELi6ELb0EEEmPKh
Line
Count
Source
175
40.0k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
40.0k
    if (BIT_WIDTH == 0) return 0;
177
178
40.0k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
40.0k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
40.0k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
40.0k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
40.0k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
40.0k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
40.0k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
40.0k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
40.0k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
40.0k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
40.0k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
40.0k
    constexpr bool READ_32_BITS =
202
40.0k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
40.0k
    if (READ_32_BITS) {
205
40.0k
        uint32_t word = in[FIRST_WORD_IDX];
206
40.0k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
40.0k
        return word & mask;
208
40.0k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
40.0k
}
_ZN5doris11UnpackValueILi11ELi5ELb0EEEmPKh
Line
Count
Source
175
40.0k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
40.0k
    if (BIT_WIDTH == 0) return 0;
177
178
40.0k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
40.0k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
40.0k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
40.0k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
40.0k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
40.0k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
40.0k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
40.0k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
40.0k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
40.0k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
40.0k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
40.0k
    constexpr bool READ_32_BITS =
202
40.0k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
40.0k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
40.0k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
40.0k
    word >>= FIRST_BIT_OFFSET;
212
213
40.0k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
40.0k
    return word & mask;
220
40.0k
}
_ZN5doris11UnpackValueILi11ELi4ELb0EEEmPKh
Line
Count
Source
175
40.0k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
40.0k
    if (BIT_WIDTH == 0) return 0;
177
178
40.0k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
40.0k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
40.0k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
40.0k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
40.0k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
40.0k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
40.0k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
40.0k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
40.0k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
40.0k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
40.0k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
40.0k
    constexpr bool READ_32_BITS =
202
40.0k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
40.0k
    if (READ_32_BITS) {
205
40.0k
        uint32_t word = in[FIRST_WORD_IDX];
206
40.0k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
40.0k
        return word & mask;
208
40.0k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
40.0k
}
_ZN5doris11UnpackValueILi11ELi3ELb0EEEmPKh
Line
Count
Source
175
40.0k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
40.0k
    if (BIT_WIDTH == 0) return 0;
177
178
40.0k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
40.0k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
40.0k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
40.0k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
40.0k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
40.0k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
40.0k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
40.0k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
40.0k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
40.0k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
40.0k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
40.0k
    constexpr bool READ_32_BITS =
202
40.0k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
40.0k
    if (READ_32_BITS) {
205
40.0k
        uint32_t word = in[FIRST_WORD_IDX];
206
40.0k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
40.0k
        return word & mask;
208
40.0k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
40.0k
}
_ZN5doris11UnpackValueILi11ELi2ELb0EEEmPKh
Line
Count
Source
175
40.0k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
40.0k
    if (BIT_WIDTH == 0) return 0;
177
178
40.0k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
40.0k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
40.0k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
40.0k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
40.0k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
40.0k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
40.0k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
40.0k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
40.0k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
40.0k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
40.0k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
40.0k
    constexpr bool READ_32_BITS =
202
40.0k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
40.0k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
40.0k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
40.0k
    word >>= FIRST_BIT_OFFSET;
212
213
40.0k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
40.0k
    return word & mask;
220
40.0k
}
_ZN5doris11UnpackValueILi11ELi1ELb0EEEmPKh
Line
Count
Source
175
40.0k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
40.0k
    if (BIT_WIDTH == 0) return 0;
177
178
40.0k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
40.0k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
40.0k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
40.0k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
40.0k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
40.0k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
40.0k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
40.0k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
40.0k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
40.0k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
40.0k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
40.0k
    constexpr bool READ_32_BITS =
202
40.0k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
40.0k
    if (READ_32_BITS) {
205
40.0k
        uint32_t word = in[FIRST_WORD_IDX];
206
40.0k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
40.0k
        return word & mask;
208
40.0k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
40.0k
}
_ZN5doris11UnpackValueILi11ELi0ELb0EEEmPKh
Line
Count
Source
175
40.0k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
40.0k
    if (BIT_WIDTH == 0) return 0;
177
178
40.0k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
40.0k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
40.0k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
40.0k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
40.0k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
40.0k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
40.0k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
40.0k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
40.0k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
40.0k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
40.0k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
40.0k
    constexpr bool READ_32_BITS =
202
40.0k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
40.0k
    if (READ_32_BITS) {
205
40.0k
        uint32_t word = in[FIRST_WORD_IDX];
206
40.0k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
40.0k
        return word & mask;
208
40.0k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
40.0k
}
_ZN5doris11UnpackValueILi12ELi0ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi1ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi2ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi3ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi4ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi5ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi6ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi7ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi8ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi9ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi10ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi11ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi12ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi13ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi14ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi15ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi16ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi17ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi18ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi19ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi20ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi21ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi22ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi23ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi24ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi25ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi26ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi27ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi28ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi29ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
3.78M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.78M
    word >>= FIRST_BIT_OFFSET;
212
213
3.78M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.78M
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi30ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
3.78M
        uint32_t word = in[FIRST_WORD_IDX];
206
3.78M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
3.78M
        return word & mask;
208
3.78M
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
3.78M
}
_ZN5doris11UnpackValueILi12ELi31ELb1EEEmPKh
Line
Count
Source
175
3.78M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
3.78M
    if (BIT_WIDTH == 0) return 0;
177
178
3.78M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
3.78M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
3.78M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
3.78M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
3.78M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
3.78M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
3.78M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
3.78M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
3.78M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
3.78M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
3.78M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
3.78M
    constexpr bool READ_32_BITS =
202
3.78M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
3.78M
    if (READ_32_BITS) {
205
3.78M
        uint32_t word = in[FIRST_WORD_IDX];
206
3.78M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
3.78M
        return word & mask;
208
3.78M
    }
209
210
34
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
34
    word >>= FIRST_BIT_OFFSET;
212
213
34
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
34
    return word & mask;
220
3.78M
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi12ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi12ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi12ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi12ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi12ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi12ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi12ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi12ELi23ELb0EEEmPKh
Line
Count
Source
175
256k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
256k
    if (BIT_WIDTH == 0) return 0;
177
178
256k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
256k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
256k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
256k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
256k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
256k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
256k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
256k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
256k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
256k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
256k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
256k
    constexpr bool READ_32_BITS =
202
256k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
256k
    if (READ_32_BITS) {
205
256k
        uint32_t word = in[FIRST_WORD_IDX];
206
256k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
256k
        return word & mask;
208
256k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
256k
}
_ZN5doris11UnpackValueILi12ELi22ELb0EEEmPKh
Line
Count
Source
175
256k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
256k
    if (BIT_WIDTH == 0) return 0;
177
178
256k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
256k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
256k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
256k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
256k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
256k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
256k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
256k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
256k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
256k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
256k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
256k
    constexpr bool READ_32_BITS =
202
256k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
256k
    if (READ_32_BITS) {
205
256k
        uint32_t word = in[FIRST_WORD_IDX];
206
256k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
256k
        return word & mask;
208
256k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
256k
}
_ZN5doris11UnpackValueILi12ELi21ELb0EEEmPKh
Line
Count
Source
175
256k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
256k
    if (BIT_WIDTH == 0) return 0;
177
178
256k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
256k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
256k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
256k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
256k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
256k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
256k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
256k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
256k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
256k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
256k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
256k
    constexpr bool READ_32_BITS =
202
256k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
256k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
256k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
256k
    word >>= FIRST_BIT_OFFSET;
212
213
256k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
256k
    return word & mask;
220
256k
}
_ZN5doris11UnpackValueILi12ELi20ELb0EEEmPKh
Line
Count
Source
175
256k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
256k
    if (BIT_WIDTH == 0) return 0;
177
178
256k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
256k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
256k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
256k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
256k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
256k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
256k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
256k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
256k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
256k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
256k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
256k
    constexpr bool READ_32_BITS =
202
256k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
256k
    if (READ_32_BITS) {
205
256k
        uint32_t word = in[FIRST_WORD_IDX];
206
256k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
256k
        return word & mask;
208
256k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
256k
}
_ZN5doris11UnpackValueILi12ELi19ELb0EEEmPKh
Line
Count
Source
175
256k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
256k
    if (BIT_WIDTH == 0) return 0;
177
178
256k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
256k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
256k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
256k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
256k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
256k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
256k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
256k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
256k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
256k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
256k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
256k
    constexpr bool READ_32_BITS =
202
256k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
256k
    if (READ_32_BITS) {
205
256k
        uint32_t word = in[FIRST_WORD_IDX];
206
256k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
256k
        return word & mask;
208
256k
    }
209
210
4
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
4
    word >>= FIRST_BIT_OFFSET;
212
213
4
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
4
    return word & mask;
220
256k
}
_ZN5doris11UnpackValueILi12ELi18ELb0EEEmPKh
Line
Count
Source
175
256k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
256k
    if (BIT_WIDTH == 0) return 0;
177
178
256k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
256k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
256k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
256k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
256k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
256k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
256k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
256k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
256k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
256k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
256k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
256k
    constexpr bool READ_32_BITS =
202
256k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
256k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
256k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
256k
    word >>= FIRST_BIT_OFFSET;
212
213
256k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
256k
    return word & mask;
220
256k
}
_ZN5doris11UnpackValueILi12ELi17ELb0EEEmPKh
Line
Count
Source
175
256k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
256k
    if (BIT_WIDTH == 0) return 0;
177
178
256k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
256k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
256k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
256k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
256k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
256k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
256k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
256k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
256k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
256k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
256k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
256k
    constexpr bool READ_32_BITS =
202
256k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
256k
    if (READ_32_BITS) {
205
256k
        uint32_t word = in[FIRST_WORD_IDX];
206
256k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
256k
        return word & mask;
208
256k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
256k
}
_ZN5doris11UnpackValueILi12ELi16ELb0EEEmPKh
Line
Count
Source
175
256k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
256k
    if (BIT_WIDTH == 0) return 0;
177
178
256k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
256k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
256k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
256k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
256k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
256k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
256k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
256k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
256k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
256k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
256k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
256k
    constexpr bool READ_32_BITS =
202
256k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
256k
    if (READ_32_BITS) {
205
256k
        uint32_t word = in[FIRST_WORD_IDX];
206
256k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
256k
        return word & mask;
208
256k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
256k
}
_ZN5doris11UnpackValueILi12ELi15ELb0EEEmPKh
Line
Count
Source
175
257k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
257k
    if (BIT_WIDTH == 0) return 0;
177
178
257k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
257k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
257k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
257k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
257k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
257k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
257k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
257k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
257k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
257k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
257k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
257k
    constexpr bool READ_32_BITS =
202
257k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
257k
    if (READ_32_BITS) {
205
257k
        uint32_t word = in[FIRST_WORD_IDX];
206
257k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
257k
        return word & mask;
208
257k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
257k
}
_ZN5doris11UnpackValueILi12ELi14ELb0EEEmPKh
Line
Count
Source
175
257k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
257k
    if (BIT_WIDTH == 0) return 0;
177
178
257k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
257k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
257k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
257k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
257k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
257k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
257k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
257k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
257k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
257k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
257k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
257k
    constexpr bool READ_32_BITS =
202
257k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
257k
    if (READ_32_BITS) {
205
257k
        uint32_t word = in[FIRST_WORD_IDX];
206
257k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
257k
        return word & mask;
208
257k
    }
209
210
4
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
4
    word >>= FIRST_BIT_OFFSET;
212
213
4
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
4
    return word & mask;
220
257k
}
_ZN5doris11UnpackValueILi12ELi13ELb0EEEmPKh
Line
Count
Source
175
257k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
257k
    if (BIT_WIDTH == 0) return 0;
177
178
257k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
257k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
257k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
257k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
257k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
257k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
257k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
257k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
257k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
257k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
257k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
257k
    constexpr bool READ_32_BITS =
202
257k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
257k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
257k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
257k
    word >>= FIRST_BIT_OFFSET;
212
213
257k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
257k
    return word & mask;
220
257k
}
_ZN5doris11UnpackValueILi12ELi12ELb0EEEmPKh
Line
Count
Source
175
257k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
257k
    if (BIT_WIDTH == 0) return 0;
177
178
257k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
257k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
257k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
257k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
257k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
257k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
257k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
257k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
257k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
257k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
257k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
257k
    constexpr bool READ_32_BITS =
202
257k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
257k
    if (READ_32_BITS) {
205
257k
        uint32_t word = in[FIRST_WORD_IDX];
206
257k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
257k
        return word & mask;
208
257k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
257k
}
_ZN5doris11UnpackValueILi12ELi11ELb0EEEmPKh
Line
Count
Source
175
257k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
257k
    if (BIT_WIDTH == 0) return 0;
177
178
257k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
257k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
257k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
257k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
257k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
257k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
257k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
257k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
257k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
257k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
257k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
257k
    constexpr bool READ_32_BITS =
202
257k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
257k
    if (READ_32_BITS) {
205
257k
        uint32_t word = in[FIRST_WORD_IDX];
206
257k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
257k
        return word & mask;
208
257k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
257k
}
_ZN5doris11UnpackValueILi12ELi10ELb0EEEmPKh
Line
Count
Source
175
257k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
257k
    if (BIT_WIDTH == 0) return 0;
177
178
257k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
257k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
257k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
257k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
257k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
257k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
257k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
257k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
257k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
257k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
257k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
257k
    constexpr bool READ_32_BITS =
202
257k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
257k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
257k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
257k
    word >>= FIRST_BIT_OFFSET;
212
213
257k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
257k
    return word & mask;
220
257k
}
_ZN5doris11UnpackValueILi12ELi9ELb0EEEmPKh
Line
Count
Source
175
257k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
257k
    if (BIT_WIDTH == 0) return 0;
177
178
257k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
257k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
257k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
257k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
257k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
257k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
257k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
257k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
257k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
257k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
257k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
257k
    constexpr bool READ_32_BITS =
202
257k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
257k
    if (READ_32_BITS) {
205
257k
        uint32_t word = in[FIRST_WORD_IDX];
206
257k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
257k
        return word & mask;
208
257k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
257k
}
_ZN5doris11UnpackValueILi12ELi8ELb0EEEmPKh
Line
Count
Source
175
257k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
257k
    if (BIT_WIDTH == 0) return 0;
177
178
257k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
257k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
257k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
257k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
257k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
257k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
257k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
257k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
257k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
257k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
257k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
257k
    constexpr bool READ_32_BITS =
202
257k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
257k
    if (READ_32_BITS) {
205
257k
        uint32_t word = in[FIRST_WORD_IDX];
206
257k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
257k
        return word & mask;
208
257k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
257k
}
_ZN5doris11UnpackValueILi12ELi7ELb0EEEmPKh
Line
Count
Source
175
266k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
266k
    if (BIT_WIDTH == 0) return 0;
177
178
266k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
266k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
266k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
266k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
266k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
266k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
266k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
266k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
266k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
266k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
266k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
266k
    constexpr bool READ_32_BITS =
202
266k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
266k
    if (READ_32_BITS) {
205
266k
        uint32_t word = in[FIRST_WORD_IDX];
206
266k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
266k
        return word & mask;
208
266k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
266k
}
_ZN5doris11UnpackValueILi12ELi6ELb0EEEmPKh
Line
Count
Source
175
266k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
266k
    if (BIT_WIDTH == 0) return 0;
177
178
266k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
266k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
266k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
266k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
266k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
266k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
266k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
266k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
266k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
266k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
266k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
266k
    constexpr bool READ_32_BITS =
202
266k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
266k
    if (READ_32_BITS) {
205
266k
        uint32_t word = in[FIRST_WORD_IDX];
206
266k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
266k
        return word & mask;
208
266k
    }
209
210
6
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
6
    word >>= FIRST_BIT_OFFSET;
212
213
6
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
6
    return word & mask;
220
266k
}
_ZN5doris11UnpackValueILi12ELi5ELb0EEEmPKh
Line
Count
Source
175
266k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
266k
    if (BIT_WIDTH == 0) return 0;
177
178
266k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
266k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
266k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
266k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
266k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
266k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
266k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
266k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
266k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
266k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
266k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
266k
    constexpr bool READ_32_BITS =
202
266k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
266k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
266k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
266k
    word >>= FIRST_BIT_OFFSET;
212
213
266k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
266k
    return word & mask;
220
266k
}
_ZN5doris11UnpackValueILi12ELi4ELb0EEEmPKh
Line
Count
Source
175
266k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
266k
    if (BIT_WIDTH == 0) return 0;
177
178
266k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
266k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
266k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
266k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
266k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
266k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
266k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
266k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
266k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
266k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
266k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
266k
    constexpr bool READ_32_BITS =
202
266k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
266k
    if (READ_32_BITS) {
205
266k
        uint32_t word = in[FIRST_WORD_IDX];
206
266k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
266k
        return word & mask;
208
266k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
266k
}
_ZN5doris11UnpackValueILi12ELi3ELb0EEEmPKh
Line
Count
Source
175
266k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
266k
    if (BIT_WIDTH == 0) return 0;
177
178
266k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
266k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
266k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
266k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
266k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
266k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
266k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
266k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
266k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
266k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
266k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
266k
    constexpr bool READ_32_BITS =
202
266k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
266k
    if (READ_32_BITS) {
205
266k
        uint32_t word = in[FIRST_WORD_IDX];
206
266k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
266k
        return word & mask;
208
266k
    }
209
210
10
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
10
    word >>= FIRST_BIT_OFFSET;
212
213
10
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
10
    return word & mask;
220
266k
}
_ZN5doris11UnpackValueILi12ELi2ELb0EEEmPKh
Line
Count
Source
175
266k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
266k
    if (BIT_WIDTH == 0) return 0;
177
178
266k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
266k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
266k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
266k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
266k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
266k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
266k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
266k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
266k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
266k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
266k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
266k
    constexpr bool READ_32_BITS =
202
266k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
266k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
266k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
266k
    word >>= FIRST_BIT_OFFSET;
212
213
266k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
266k
    return word & mask;
220
266k
}
_ZN5doris11UnpackValueILi12ELi1ELb0EEEmPKh
Line
Count
Source
175
266k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
266k
    if (BIT_WIDTH == 0) return 0;
177
178
266k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
266k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
266k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
266k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
266k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
266k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
266k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
266k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
266k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
266k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
266k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
266k
    constexpr bool READ_32_BITS =
202
266k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
266k
    if (READ_32_BITS) {
205
266k
        uint32_t word = in[FIRST_WORD_IDX];
206
266k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
266k
        return word & mask;
208
266k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
266k
}
_ZN5doris11UnpackValueILi12ELi0ELb0EEEmPKh
Line
Count
Source
175
266k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
266k
    if (BIT_WIDTH == 0) return 0;
177
178
266k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
266k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
266k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
266k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
266k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
266k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
266k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
266k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
266k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
266k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
266k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
266k
    constexpr bool READ_32_BITS =
202
266k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
266k
    if (READ_32_BITS) {
205
266k
        uint32_t word = in[FIRST_WORD_IDX];
206
266k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
266k
        return word & mask;
208
266k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
266k
}
_ZN5doris11UnpackValueILi13ELi0ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi1ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi2ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi3ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi4ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi5ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi6ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi7ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi8ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi9ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi10ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi11ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi12ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi13ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi14ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi15ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi16ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi17ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi18ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi19ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi20ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi21ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi22ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi23ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi24ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi25ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi26ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi27ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi28ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi29ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
350k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
350k
    word >>= FIRST_BIT_OFFSET;
212
213
350k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
350k
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi30ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
350k
        uint32_t word = in[FIRST_WORD_IDX];
206
350k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
350k
        return word & mask;
208
350k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
350k
}
_ZN5doris11UnpackValueILi13ELi31ELb1EEEmPKh
Line
Count
Source
175
350k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
350k
    if (BIT_WIDTH == 0) return 0;
177
178
350k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
350k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
350k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
350k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
350k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
350k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
350k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
350k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
350k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
350k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
350k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
350k
    constexpr bool READ_32_BITS =
202
350k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
350k
    if (READ_32_BITS) {
205
350k
        uint32_t word = in[FIRST_WORD_IDX];
206
350k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
350k
        return word & mask;
208
350k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
350k
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi13ELi23ELb0EEEmPKh
Line
Count
Source
175
27.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
27.4k
    if (BIT_WIDTH == 0) return 0;
177
178
27.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
27.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
27.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
27.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
27.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
27.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
27.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
27.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
27.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
27.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
27.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
27.4k
    constexpr bool READ_32_BITS =
202
27.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
27.4k
    if (READ_32_BITS) {
205
27.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
27.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
27.4k
        return word & mask;
208
27.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
27.4k
}
_ZN5doris11UnpackValueILi13ELi22ELb0EEEmPKh
Line
Count
Source
175
27.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
27.4k
    if (BIT_WIDTH == 0) return 0;
177
178
27.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
27.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
27.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
27.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
27.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
27.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
27.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
27.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
27.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
27.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
27.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
27.4k
    constexpr bool READ_32_BITS =
202
27.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
27.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
27.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
27.4k
    word >>= FIRST_BIT_OFFSET;
212
213
27.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
27.4k
    return word & mask;
220
27.4k
}
_ZN5doris11UnpackValueILi13ELi21ELb0EEEmPKh
Line
Count
Source
175
27.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
27.4k
    if (BIT_WIDTH == 0) return 0;
177
178
27.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
27.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
27.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
27.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
27.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
27.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
27.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
27.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
27.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
27.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
27.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
27.4k
    constexpr bool READ_32_BITS =
202
27.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
27.4k
    if (READ_32_BITS) {
205
27.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
27.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
27.4k
        return word & mask;
208
27.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
27.4k
}
_ZN5doris11UnpackValueILi13ELi20ELb0EEEmPKh
Line
Count
Source
175
27.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
27.4k
    if (BIT_WIDTH == 0) return 0;
177
178
27.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
27.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
27.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
27.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
27.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
27.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
27.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
27.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
27.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
27.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
27.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
27.4k
    constexpr bool READ_32_BITS =
202
27.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
27.4k
    if (READ_32_BITS) {
205
27.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
27.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
27.4k
        return word & mask;
208
27.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
27.4k
}
_ZN5doris11UnpackValueILi13ELi19ELb0EEEmPKh
Line
Count
Source
175
27.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
27.4k
    if (BIT_WIDTH == 0) return 0;
177
178
27.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
27.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
27.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
27.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
27.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
27.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
27.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
27.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
27.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
27.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
27.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
27.4k
    constexpr bool READ_32_BITS =
202
27.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
27.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
27.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
27.4k
    word >>= FIRST_BIT_OFFSET;
212
213
27.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
27.4k
    return word & mask;
220
27.4k
}
_ZN5doris11UnpackValueILi13ELi18ELb0EEEmPKh
Line
Count
Source
175
27.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
27.4k
    if (BIT_WIDTH == 0) return 0;
177
178
27.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
27.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
27.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
27.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
27.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
27.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
27.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
27.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
27.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
27.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
27.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
27.4k
    constexpr bool READ_32_BITS =
202
27.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
27.4k
    if (READ_32_BITS) {
205
27.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
27.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
27.4k
        return word & mask;
208
27.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
27.4k
}
_ZN5doris11UnpackValueILi13ELi17ELb0EEEmPKh
Line
Count
Source
175
27.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
27.4k
    if (BIT_WIDTH == 0) return 0;
177
178
27.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
27.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
27.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
27.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
27.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
27.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
27.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
27.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
27.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
27.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
27.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
27.4k
    constexpr bool READ_32_BITS =
202
27.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
27.4k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
27.4k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
27.4k
    word >>= FIRST_BIT_OFFSET;
212
213
27.4k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
27.4k
    return word & mask;
220
27.4k
}
_ZN5doris11UnpackValueILi13ELi16ELb0EEEmPKh
Line
Count
Source
175
27.4k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
27.4k
    if (BIT_WIDTH == 0) return 0;
177
178
27.4k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
27.4k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
27.4k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
27.4k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
27.4k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
27.4k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
27.4k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
27.4k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
27.4k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
27.4k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
27.4k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
27.4k
    constexpr bool READ_32_BITS =
202
27.4k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
27.4k
    if (READ_32_BITS) {
205
27.4k
        uint32_t word = in[FIRST_WORD_IDX];
206
27.4k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
27.4k
        return word & mask;
208
27.4k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
27.4k
}
_ZN5doris11UnpackValueILi13ELi15ELb0EEEmPKh
Line
Count
Source
175
29.0k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
29.0k
    if (BIT_WIDTH == 0) return 0;
177
178
29.0k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
29.0k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
29.0k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
29.0k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
29.0k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
29.0k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
29.0k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
29.0k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
29.0k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
29.0k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
29.0k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
29.0k
    constexpr bool READ_32_BITS =
202
29.0k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
29.0k
    if (READ_32_BITS) {
205
29.0k
        uint32_t word = in[FIRST_WORD_IDX];
206
29.0k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
29.0k
        return word & mask;
208
29.0k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
29.0k
}
_ZN5doris11UnpackValueILi13ELi14ELb0EEEmPKh
Line
Count
Source
175
29.0k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
29.0k
    if (BIT_WIDTH == 0) return 0;
177
178
29.0k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
29.0k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
29.0k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
29.0k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
29.0k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
29.0k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
29.0k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
29.0k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
29.0k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
29.0k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
29.0k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
29.0k
    constexpr bool READ_32_BITS =
202
29.0k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
29.0k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
29.0k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
29.0k
    word >>= FIRST_BIT_OFFSET;
212
213
29.0k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
29.0k
    return word & mask;
220
29.0k
}
_ZN5doris11UnpackValueILi13ELi13ELb0EEEmPKh
Line
Count
Source
175
29.0k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
29.0k
    if (BIT_WIDTH == 0) return 0;
177
178
29.0k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
29.0k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
29.0k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
29.0k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
29.0k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
29.0k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
29.0k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
29.0k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
29.0k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
29.0k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
29.0k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
29.0k
    constexpr bool READ_32_BITS =
202
29.0k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
29.0k
    if (READ_32_BITS) {
205
29.0k
        uint32_t word = in[FIRST_WORD_IDX];
206
29.0k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
29.0k
        return word & mask;
208
29.0k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
29.0k
}
_ZN5doris11UnpackValueILi13ELi12ELb0EEEmPKh
Line
Count
Source
175
29.0k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
29.0k
    if (BIT_WIDTH == 0) return 0;
177
178
29.0k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
29.0k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
29.0k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
29.0k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
29.0k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
29.0k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
29.0k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
29.0k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
29.0k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
29.0k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
29.0k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
29.0k
    constexpr bool READ_32_BITS =
202
29.0k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
29.0k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
29.0k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
29.0k
    word >>= FIRST_BIT_OFFSET;
212
213
29.0k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
29.0k
    return word & mask;
220
29.0k
}
_ZN5doris11UnpackValueILi13ELi11ELb0EEEmPKh
Line
Count
Source
175
29.0k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
29.0k
    if (BIT_WIDTH == 0) return 0;
177
178
29.0k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
29.0k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
29.0k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
29.0k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
29.0k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
29.0k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
29.0k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
29.0k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
29.0k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
29.0k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
29.0k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
29.0k
    constexpr bool READ_32_BITS =
202
29.0k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
29.0k
    if (READ_32_BITS) {
205
29.0k
        uint32_t word = in[FIRST_WORD_IDX];
206
29.0k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
29.0k
        return word & mask;
208
29.0k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
29.0k
}
_ZN5doris11UnpackValueILi13ELi10ELb0EEEmPKh
Line
Count
Source
175
29.0k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
29.0k
    if (BIT_WIDTH == 0) return 0;
177
178
29.0k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
29.0k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
29.0k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
29.0k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
29.0k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
29.0k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
29.0k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
29.0k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
29.0k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
29.0k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
29.0k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
29.0k
    constexpr bool READ_32_BITS =
202
29.0k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
29.0k
    if (READ_32_BITS) {
205
29.0k
        uint32_t word = in[FIRST_WORD_IDX];
206
29.0k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
29.0k
        return word & mask;
208
29.0k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
29.0k
}
_ZN5doris11UnpackValueILi13ELi9ELb0EEEmPKh
Line
Count
Source
175
29.0k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
29.0k
    if (BIT_WIDTH == 0) return 0;
177
178
29.0k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
29.0k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
29.0k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
29.0k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
29.0k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
29.0k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
29.0k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
29.0k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
29.0k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
29.0k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
29.0k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
29.0k
    constexpr bool READ_32_BITS =
202
29.0k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
29.0k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
29.0k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
29.0k
    word >>= FIRST_BIT_OFFSET;
212
213
29.0k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
29.0k
    return word & mask;
220
29.0k
}
_ZN5doris11UnpackValueILi13ELi8ELb0EEEmPKh
Line
Count
Source
175
29.0k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
29.0k
    if (BIT_WIDTH == 0) return 0;
177
178
29.0k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
29.0k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
29.0k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
29.0k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
29.0k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
29.0k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
29.0k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
29.0k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
29.0k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
29.0k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
29.0k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
29.0k
    constexpr bool READ_32_BITS =
202
29.0k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
29.0k
    if (READ_32_BITS) {
205
29.0k
        uint32_t word = in[FIRST_WORD_IDX];
206
29.0k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
29.0k
        return word & mask;
208
29.0k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
29.0k
}
_ZN5doris11UnpackValueILi13ELi7ELb0EEEmPKh
Line
Count
Source
175
42.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
42.2k
    if (BIT_WIDTH == 0) return 0;
177
178
42.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
42.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
42.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
42.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
42.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
42.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
42.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
42.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
42.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
42.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
42.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
42.2k
    constexpr bool READ_32_BITS =
202
42.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
42.2k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
42.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
42.2k
    word >>= FIRST_BIT_OFFSET;
212
213
42.2k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
42.2k
    return word & mask;
220
42.2k
}
_ZN5doris11UnpackValueILi13ELi6ELb0EEEmPKh
Line
Count
Source
175
42.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
42.2k
    if (BIT_WIDTH == 0) return 0;
177
178
42.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
42.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
42.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
42.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
42.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
42.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
42.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
42.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
42.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
42.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
42.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
42.2k
    constexpr bool READ_32_BITS =
202
42.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
42.2k
    if (READ_32_BITS) {
205
42.2k
        uint32_t word = in[FIRST_WORD_IDX];
206
42.2k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
42.2k
        return word & mask;
208
42.2k
    }
209
210
4
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
4
    word >>= FIRST_BIT_OFFSET;
212
213
4
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
4
    return word & mask;
220
42.2k
}
_ZN5doris11UnpackValueILi13ELi5ELb0EEEmPKh
Line
Count
Source
175
42.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
42.2k
    if (BIT_WIDTH == 0) return 0;
177
178
42.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
42.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
42.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
42.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
42.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
42.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
42.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
42.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
42.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
42.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
42.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
42.2k
    constexpr bool READ_32_BITS =
202
42.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
42.2k
    if (READ_32_BITS) {
205
42.2k
        uint32_t word = in[FIRST_WORD_IDX];
206
42.2k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
42.2k
        return word & mask;
208
42.2k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
42.2k
}
_ZN5doris11UnpackValueILi13ELi4ELb0EEEmPKh
Line
Count
Source
175
42.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
42.2k
    if (BIT_WIDTH == 0) return 0;
177
178
42.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
42.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
42.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
42.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
42.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
42.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
42.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
42.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
42.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
42.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
42.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
42.2k
    constexpr bool READ_32_BITS =
202
42.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
42.2k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
42.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
42.2k
    word >>= FIRST_BIT_OFFSET;
212
213
42.2k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
42.2k
    return word & mask;
220
42.2k
}
_ZN5doris11UnpackValueILi13ELi3ELb0EEEmPKh
Line
Count
Source
175
42.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
42.2k
    if (BIT_WIDTH == 0) return 0;
177
178
42.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
42.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
42.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
42.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
42.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
42.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
42.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
42.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
42.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
42.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
42.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
42.2k
    constexpr bool READ_32_BITS =
202
42.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
42.2k
    if (READ_32_BITS) {
205
42.2k
        uint32_t word = in[FIRST_WORD_IDX];
206
42.2k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
42.2k
        return word & mask;
208
42.2k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
42.2k
}
_ZN5doris11UnpackValueILi13ELi2ELb0EEEmPKh
Line
Count
Source
175
42.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
42.2k
    if (BIT_WIDTH == 0) return 0;
177
178
42.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
42.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
42.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
42.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
42.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
42.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
42.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
42.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
42.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
42.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
42.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
42.2k
    constexpr bool READ_32_BITS =
202
42.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
42.2k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
42.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
42.2k
    word >>= FIRST_BIT_OFFSET;
212
213
42.2k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
42.2k
    return word & mask;
220
42.2k
}
_ZN5doris11UnpackValueILi13ELi1ELb0EEEmPKh
Line
Count
Source
175
42.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
42.2k
    if (BIT_WIDTH == 0) return 0;
177
178
42.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
42.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
42.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
42.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
42.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
42.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
42.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
42.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
42.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
42.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
42.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
42.2k
    constexpr bool READ_32_BITS =
202
42.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
42.2k
    if (READ_32_BITS) {
205
42.2k
        uint32_t word = in[FIRST_WORD_IDX];
206
42.2k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
42.2k
        return word & mask;
208
42.2k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
42.2k
}
_ZN5doris11UnpackValueILi13ELi0ELb0EEEmPKh
Line
Count
Source
175
42.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
42.2k
    if (BIT_WIDTH == 0) return 0;
177
178
42.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
42.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
42.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
42.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
42.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
42.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
42.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
42.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
42.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
42.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
42.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
42.2k
    constexpr bool READ_32_BITS =
202
42.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
42.2k
    if (READ_32_BITS) {
205
42.2k
        uint32_t word = in[FIRST_WORD_IDX];
206
42.2k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
42.2k
        return word & mask;
208
42.2k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
42.2k
}
_ZN5doris11UnpackValueILi14ELi0ELb1EEEmPKh
Line
Count
Source
175
2.25M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.25M
    if (BIT_WIDTH == 0) return 0;
177
178
2.25M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.25M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.25M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.25M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.25M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.25M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.25M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.25M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.25M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.25M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.25M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.25M
    constexpr bool READ_32_BITS =
202
2.25M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.25M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.25M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.25M
    word >>= FIRST_BIT_OFFSET;
212
213
2.25M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.25M
    return word & mask;
220
2.25M
}
_ZN5doris11UnpackValueILi14ELi1ELb1EEEmPKh
Line
Count
Source
175
2.25M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.25M
    if (BIT_WIDTH == 0) return 0;
177
178
2.25M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.25M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.25M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.25M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.25M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.25M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.25M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.25M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.25M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.25M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.25M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.25M
    constexpr bool READ_32_BITS =
202
2.25M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.25M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.25M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.25M
    word >>= FIRST_BIT_OFFSET;
212
213
2.25M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.25M
    return word & mask;
220
2.25M
}
_ZN5doris11UnpackValueILi14ELi2ELb1EEEmPKh
Line
Count
Source
175
2.25M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.25M
    if (BIT_WIDTH == 0) return 0;
177
178
2.25M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.25M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.25M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.25M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.25M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.25M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.25M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.25M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.25M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.25M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.25M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.25M
    constexpr bool READ_32_BITS =
202
2.25M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.25M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.25M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.25M
    word >>= FIRST_BIT_OFFSET;
212
213
2.25M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.25M
    return word & mask;
220
2.25M
}
_ZN5doris11UnpackValueILi14ELi3ELb1EEEmPKh
Line
Count
Source
175
2.25M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.25M
    if (BIT_WIDTH == 0) return 0;
177
178
2.25M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.25M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.25M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.25M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.25M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.25M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.25M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.25M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.25M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.25M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.25M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.25M
    constexpr bool READ_32_BITS =
202
2.25M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.25M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.25M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.25M
    word >>= FIRST_BIT_OFFSET;
212
213
2.25M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.25M
    return word & mask;
220
2.25M
}
_ZN5doris11UnpackValueILi14ELi4ELb1EEEmPKh
Line
Count
Source
175
2.25M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.25M
    if (BIT_WIDTH == 0) return 0;
177
178
2.25M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.25M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.25M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.25M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.25M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.25M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.25M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.25M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.25M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.25M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.25M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.25M
    constexpr bool READ_32_BITS =
202
2.25M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.25M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.25M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.25M
    word >>= FIRST_BIT_OFFSET;
212
213
2.25M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.25M
    return word & mask;
220
2.25M
}
_ZN5doris11UnpackValueILi14ELi5ELb1EEEmPKh
Line
Count
Source
175
2.25M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.25M
    if (BIT_WIDTH == 0) return 0;
177
178
2.25M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.25M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.25M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.25M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.25M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.25M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.25M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.25M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.25M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.25M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.25M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.25M
    constexpr bool READ_32_BITS =
202
2.25M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.25M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.25M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.25M
    word >>= FIRST_BIT_OFFSET;
212
213
2.25M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.25M
    return word & mask;
220
2.25M
}
_ZN5doris11UnpackValueILi14ELi6ELb1EEEmPKh
Line
Count
Source
175
2.25M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.25M
    if (BIT_WIDTH == 0) return 0;
177
178
2.25M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.25M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.25M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.25M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.25M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.25M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.25M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.25M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.25M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.25M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.25M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.25M
    constexpr bool READ_32_BITS =
202
2.25M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.25M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.25M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.25M
    word >>= FIRST_BIT_OFFSET;
212
213
2.25M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.25M
    return word & mask;
220
2.25M
}
_ZN5doris11UnpackValueILi14ELi7ELb1EEEmPKh
Line
Count
Source
175
2.25M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.25M
    if (BIT_WIDTH == 0) return 0;
177
178
2.25M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.25M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.25M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.25M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.25M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.25M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.25M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.25M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.25M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.25M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.25M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.25M
    constexpr bool READ_32_BITS =
202
2.25M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.25M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.25M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.25M
    word >>= FIRST_BIT_OFFSET;
212
213
2.25M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.25M
    return word & mask;
220
2.25M
}
_ZN5doris11UnpackValueILi14ELi8ELb1EEEmPKh
Line
Count
Source
175
2.25M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.25M
    if (BIT_WIDTH == 0) return 0;
177
178
2.25M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.25M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.25M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.25M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.25M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.25M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.25M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.25M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.25M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.25M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.25M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.25M
    constexpr bool READ_32_BITS =
202
2.25M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.25M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.25M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.25M
    word >>= FIRST_BIT_OFFSET;
212
213
2.25M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.25M
    return word & mask;
220
2.25M
}
_ZN5doris11UnpackValueILi14ELi9ELb1EEEmPKh
Line
Count
Source
175
2.25M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.25M
    if (BIT_WIDTH == 0) return 0;
177
178
2.25M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.25M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.25M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.25M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.25M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.25M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.25M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.25M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.25M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.25M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.25M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.25M
    constexpr bool READ_32_BITS =
202
2.25M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.25M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.25M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.25M
    word >>= FIRST_BIT_OFFSET;
212
213
2.25M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.25M
    return word & mask;
220
2.25M
}
_ZN5doris11UnpackValueILi14ELi10ELb1EEEmPKh
Line
Count
Source
175
2.25M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.25M
    if (BIT_WIDTH == 0) return 0;
177
178
2.25M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.25M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.25M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.25M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.25M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.25M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.25M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.25M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.25M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.25M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.25M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.25M
    constexpr bool READ_32_BITS =
202
2.25M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.25M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.25M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.25M
    word >>= FIRST_BIT_OFFSET;
212
213
2.25M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.25M
    return word & mask;
220
2.25M
}
_ZN5doris11UnpackValueILi14ELi11ELb1EEEmPKh
Line
Count
Source
175
2.25M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.25M
    if (BIT_WIDTH == 0) return 0;
177
178
2.25M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.25M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.25M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.25M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.25M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.25M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.25M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.25M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.25M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.25M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.25M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.25M
    constexpr bool READ_32_BITS =
202
2.25M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.25M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.25M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.25M
    word >>= FIRST_BIT_OFFSET;
212
213
2.25M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.25M
    return word & mask;
220
2.25M
}
_ZN5doris11UnpackValueILi14ELi12ELb1EEEmPKh
Line
Count
Source
175
2.25M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.25M
    if (BIT_WIDTH == 0) return 0;
177
178
2.25M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.25M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.25M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.25M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.25M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.25M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.25M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.25M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.25M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.25M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.25M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.25M
    constexpr bool READ_32_BITS =
202
2.25M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.25M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.25M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.25M
    word >>= FIRST_BIT_OFFSET;
212
213
2.25M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.25M
    return word & mask;
220
2.25M
}
_ZN5doris11UnpackValueILi14ELi13ELb1EEEmPKh
Line
Count
Source
175
2.25M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.25M
    if (BIT_WIDTH == 0) return 0;
177
178
2.25M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.25M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.25M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.25M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.25M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.25M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.25M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.25M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.25M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.25M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.25M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.25M
    constexpr bool READ_32_BITS =
202
2.25M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.25M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.25M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.25M
    word >>= FIRST_BIT_OFFSET;
212
213
2.25M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.25M
    return word & mask;
220
2.25M
}
_ZN5doris11UnpackValueILi14ELi14ELb1EEEmPKh
Line
Count
Source
175
2.25M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.25M
    if (BIT_WIDTH == 0) return 0;
177
178
2.25M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.25M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.25M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.25M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.25M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.25M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.25M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.25M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.25M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.25M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.25M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.25M
    constexpr bool READ_32_BITS =
202
2.25M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.25M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.25M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.25M
    word >>= FIRST_BIT_OFFSET;
212
213
2.25M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.25M
    return word & mask;
220
2.25M
}
_ZN5doris11UnpackValueILi14ELi15ELb1EEEmPKh
Line
Count
Source
175
2.25M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.25M
    if (BIT_WIDTH == 0) return 0;
177
178
2.25M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.25M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.25M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.25M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.25M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.25M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.25M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.25M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.25M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.25M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.25M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.25M
    constexpr bool READ_32_BITS =
202
2.25M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.25M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.25M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.25M
    word >>= FIRST_BIT_OFFSET;
212
213
2.25M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.25M
    return word & mask;
220
2.25M
}
_ZN5doris11UnpackValueILi14ELi16ELb1EEEmPKh
Line
Count
Source
175
2.25M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.25M
    if (BIT_WIDTH == 0) return 0;
177
178
2.25M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.25M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.25M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.25M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.25M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.25M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.25M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.25M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.25M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.25M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.25M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.25M
    constexpr bool READ_32_BITS =
202
2.25M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.25M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.25M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.25M
    word >>= FIRST_BIT_OFFSET;
212
213
2.25M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.25M
    return word & mask;
220
2.25M
}
_ZN5doris11UnpackValueILi14ELi17ELb1EEEmPKh
Line
Count
Source
175
2.25M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.25M
    if (BIT_WIDTH == 0) return 0;
177
178
2.25M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.25M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.25M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.25M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.25M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.25M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.25M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.25M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.25M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.25M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.25M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.25M
    constexpr bool READ_32_BITS =
202
2.25M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.25M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.25M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.25M
    word >>= FIRST_BIT_OFFSET;
212
213
2.25M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.25M
    return word & mask;
220
2.25M
}
_ZN5doris11UnpackValueILi14ELi18ELb1EEEmPKh
Line
Count
Source
175
2.24M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.24M
    if (BIT_WIDTH == 0) return 0;
177
178
2.24M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.24M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.24M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.24M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.24M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.24M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.24M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.24M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.24M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.24M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.24M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.24M
    constexpr bool READ_32_BITS =
202
2.24M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.24M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.24M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.24M
    word >>= FIRST_BIT_OFFSET;
212
213
2.24M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.24M
    return word & mask;
220
2.24M
}
_ZN5doris11UnpackValueILi14ELi19ELb1EEEmPKh
Line
Count
Source
175
2.24M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.24M
    if (BIT_WIDTH == 0) return 0;
177
178
2.24M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.24M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.24M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.24M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.24M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.24M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.24M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.24M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.24M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.24M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.24M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.24M
    constexpr bool READ_32_BITS =
202
2.24M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.24M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.24M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.24M
    word >>= FIRST_BIT_OFFSET;
212
213
2.24M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.24M
    return word & mask;
220
2.24M
}
_ZN5doris11UnpackValueILi14ELi20ELb1EEEmPKh
Line
Count
Source
175
2.24M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.24M
    if (BIT_WIDTH == 0) return 0;
177
178
2.24M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.24M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.24M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.24M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.24M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.24M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.24M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.24M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.24M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.24M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.24M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.24M
    constexpr bool READ_32_BITS =
202
2.24M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.24M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.24M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.24M
    word >>= FIRST_BIT_OFFSET;
212
213
2.24M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.24M
    return word & mask;
220
2.24M
}
_ZN5doris11UnpackValueILi14ELi21ELb1EEEmPKh
Line
Count
Source
175
2.24M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.24M
    if (BIT_WIDTH == 0) return 0;
177
178
2.24M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.24M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.24M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.24M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.24M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.24M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.24M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.24M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.24M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.24M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.24M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.24M
    constexpr bool READ_32_BITS =
202
2.24M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.24M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.24M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.24M
    word >>= FIRST_BIT_OFFSET;
212
213
2.24M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.24M
    return word & mask;
220
2.24M
}
_ZN5doris11UnpackValueILi14ELi22ELb1EEEmPKh
Line
Count
Source
175
2.24M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.24M
    if (BIT_WIDTH == 0) return 0;
177
178
2.24M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.24M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.24M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.24M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.24M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.24M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.24M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.24M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.24M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.24M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.24M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.24M
    constexpr bool READ_32_BITS =
202
2.24M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.24M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.24M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.24M
    word >>= FIRST_BIT_OFFSET;
212
213
2.24M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.24M
    return word & mask;
220
2.24M
}
_ZN5doris11UnpackValueILi14ELi23ELb1EEEmPKh
Line
Count
Source
175
2.24M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.24M
    if (BIT_WIDTH == 0) return 0;
177
178
2.24M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.24M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.24M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.24M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.24M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.24M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.24M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.24M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.24M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.24M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.24M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.24M
    constexpr bool READ_32_BITS =
202
2.24M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.24M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.24M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.24M
    word >>= FIRST_BIT_OFFSET;
212
213
2.24M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.24M
    return word & mask;
220
2.24M
}
_ZN5doris11UnpackValueILi14ELi24ELb1EEEmPKh
Line
Count
Source
175
2.24M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.24M
    if (BIT_WIDTH == 0) return 0;
177
178
2.24M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.24M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.24M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.24M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.24M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.24M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.24M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.24M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.24M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.24M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.24M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.24M
    constexpr bool READ_32_BITS =
202
2.24M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.24M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.24M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.24M
    word >>= FIRST_BIT_OFFSET;
212
213
2.24M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.24M
    return word & mask;
220
2.24M
}
_ZN5doris11UnpackValueILi14ELi25ELb1EEEmPKh
Line
Count
Source
175
2.24M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.24M
    if (BIT_WIDTH == 0) return 0;
177
178
2.24M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.24M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.24M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.24M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.24M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.24M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.24M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.24M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.24M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.24M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.24M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.24M
    constexpr bool READ_32_BITS =
202
2.24M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.24M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.24M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.24M
    word >>= FIRST_BIT_OFFSET;
212
213
2.24M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.24M
    return word & mask;
220
2.24M
}
_ZN5doris11UnpackValueILi14ELi26ELb1EEEmPKh
Line
Count
Source
175
2.24M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.24M
    if (BIT_WIDTH == 0) return 0;
177
178
2.24M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.24M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.24M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.24M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.24M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.24M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.24M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.24M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.24M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.24M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.24M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.24M
    constexpr bool READ_32_BITS =
202
2.24M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.24M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.24M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.24M
    word >>= FIRST_BIT_OFFSET;
212
213
2.24M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.24M
    return word & mask;
220
2.24M
}
_ZN5doris11UnpackValueILi14ELi27ELb1EEEmPKh
Line
Count
Source
175
2.24M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.24M
    if (BIT_WIDTH == 0) return 0;
177
178
2.24M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.24M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.24M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.24M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.24M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.24M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.24M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.24M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.24M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.24M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.24M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.24M
    constexpr bool READ_32_BITS =
202
2.24M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.24M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.24M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.24M
    word >>= FIRST_BIT_OFFSET;
212
213
2.24M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.24M
    return word & mask;
220
2.24M
}
_ZN5doris11UnpackValueILi14ELi28ELb1EEEmPKh
Line
Count
Source
175
2.24M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.24M
    if (BIT_WIDTH == 0) return 0;
177
178
2.24M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.24M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.24M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.24M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.24M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.24M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.24M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.24M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.24M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.24M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.24M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.24M
    constexpr bool READ_32_BITS =
202
2.24M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.24M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.24M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.24M
    word >>= FIRST_BIT_OFFSET;
212
213
2.24M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.24M
    return word & mask;
220
2.24M
}
_ZN5doris11UnpackValueILi14ELi29ELb1EEEmPKh
Line
Count
Source
175
2.24M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.24M
    if (BIT_WIDTH == 0) return 0;
177
178
2.24M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.24M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.24M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.24M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.24M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.24M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.24M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.24M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.24M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.24M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.24M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.24M
    constexpr bool READ_32_BITS =
202
2.24M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.24M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
2.24M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.24M
    word >>= FIRST_BIT_OFFSET;
212
213
2.24M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.24M
    return word & mask;
220
2.24M
}
_ZN5doris11UnpackValueILi14ELi30ELb1EEEmPKh
Line
Count
Source
175
2.24M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.24M
    if (BIT_WIDTH == 0) return 0;
177
178
2.24M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.24M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.24M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.24M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.24M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.24M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.24M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.24M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.24M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.24M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.24M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.24M
    constexpr bool READ_32_BITS =
202
2.24M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.25M
    if (READ_32_BITS) {
205
2.25M
        uint32_t word = in[FIRST_WORD_IDX];
206
2.25M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
2.25M
        return word & mask;
208
2.25M
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
2.24M
}
_ZN5doris11UnpackValueILi14ELi31ELb1EEEmPKh
Line
Count
Source
175
2.25M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
2.25M
    if (BIT_WIDTH == 0) return 0;
177
178
2.25M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
2.25M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
2.25M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
2.25M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
2.25M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
2.25M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
2.25M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
2.25M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
2.25M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
2.25M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
2.25M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
2.25M
    constexpr bool READ_32_BITS =
202
2.25M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
2.25M
    if (READ_32_BITS) {
205
2.25M
        uint32_t word = in[FIRST_WORD_IDX];
206
2.25M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
2.25M
        return word & mask;
208
2.25M
    }
209
210
330
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
330
    word >>= FIRST_BIT_OFFSET;
212
213
330
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
330
    return word & mask;
220
2.25M
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi14ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi14ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi14ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi14ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi14ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi14ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi14ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi14ELi23ELb0EEEmPKh
Line
Count
Source
175
158k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
158k
    if (BIT_WIDTH == 0) return 0;
177
178
158k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
158k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
158k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
158k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
158k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
158k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
158k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
158k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
158k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
158k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
158k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
158k
    constexpr bool READ_32_BITS =
202
158k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
158k
    if (READ_32_BITS) {
205
158k
        uint32_t word = in[FIRST_WORD_IDX];
206
158k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
158k
        return word & mask;
208
158k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
158k
}
_ZN5doris11UnpackValueILi14ELi22ELb0EEEmPKh
Line
Count
Source
175
158k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
158k
    if (BIT_WIDTH == 0) return 0;
177
178
158k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
158k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
158k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
158k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
158k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
158k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
158k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
158k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
158k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
158k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
158k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
158k
    constexpr bool READ_32_BITS =
202
158k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
158k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
158k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
158k
    word >>= FIRST_BIT_OFFSET;
212
213
158k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
158k
    return word & mask;
220
158k
}
_ZN5doris11UnpackValueILi14ELi21ELb0EEEmPKh
Line
Count
Source
175
158k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
158k
    if (BIT_WIDTH == 0) return 0;
177
178
158k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
158k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
158k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
158k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
158k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
158k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
158k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
158k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
158k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
158k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
158k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
158k
    constexpr bool READ_32_BITS =
202
158k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
158k
    if (READ_32_BITS) {
205
158k
        uint32_t word = in[FIRST_WORD_IDX];
206
158k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
158k
        return word & mask;
208
158k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
158k
}
_ZN5doris11UnpackValueILi14ELi20ELb0EEEmPKh
Line
Count
Source
175
158k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
158k
    if (BIT_WIDTH == 0) return 0;
177
178
158k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
158k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
158k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
158k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
158k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
158k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
158k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
158k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
158k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
158k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
158k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
158k
    constexpr bool READ_32_BITS =
202
158k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
158k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
158k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
158k
    word >>= FIRST_BIT_OFFSET;
212
213
158k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
158k
    return word & mask;
220
158k
}
_ZN5doris11UnpackValueILi14ELi19ELb0EEEmPKh
Line
Count
Source
175
158k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
158k
    if (BIT_WIDTH == 0) return 0;
177
178
158k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
158k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
158k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
158k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
158k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
158k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
158k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
158k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
158k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
158k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
158k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
158k
    constexpr bool READ_32_BITS =
202
158k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
158k
    if (READ_32_BITS) {
205
158k
        uint32_t word = in[FIRST_WORD_IDX];
206
158k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
158k
        return word & mask;
208
158k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
158k
}
_ZN5doris11UnpackValueILi14ELi18ELb0EEEmPKh
Line
Count
Source
175
158k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
158k
    if (BIT_WIDTH == 0) return 0;
177
178
158k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
158k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
158k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
158k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
158k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
158k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
158k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
158k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
158k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
158k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
158k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
158k
    constexpr bool READ_32_BITS =
202
158k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
158k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
158k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
158k
    word >>= FIRST_BIT_OFFSET;
212
213
158k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
158k
    return word & mask;
220
158k
}
_ZN5doris11UnpackValueILi14ELi17ELb0EEEmPKh
Line
Count
Source
175
158k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
158k
    if (BIT_WIDTH == 0) return 0;
177
178
158k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
158k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
158k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
158k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
158k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
158k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
158k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
158k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
158k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
158k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
158k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
158k
    constexpr bool READ_32_BITS =
202
158k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
158k
    if (READ_32_BITS) {
205
158k
        uint32_t word = in[FIRST_WORD_IDX];
206
158k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
158k
        return word & mask;
208
158k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
158k
}
_ZN5doris11UnpackValueILi14ELi16ELb0EEEmPKh
Line
Count
Source
175
158k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
158k
    if (BIT_WIDTH == 0) return 0;
177
178
158k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
158k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
158k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
158k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
158k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
158k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
158k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
158k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
158k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
158k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
158k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
158k
    constexpr bool READ_32_BITS =
202
158k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
158k
    if (READ_32_BITS) {
205
158k
        uint32_t word = in[FIRST_WORD_IDX];
206
158k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
158k
        return word & mask;
208
158k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
158k
}
_ZN5doris11UnpackValueILi14ELi15ELb0EEEmPKh
Line
Count
Source
175
159k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
159k
    if (BIT_WIDTH == 0) return 0;
177
178
159k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
159k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
159k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
159k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
159k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
159k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
159k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
159k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
159k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
159k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
159k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
159k
    constexpr bool READ_32_BITS =
202
159k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
159k
    if (READ_32_BITS) {
205
159k
        uint32_t word = in[FIRST_WORD_IDX];
206
159k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
159k
        return word & mask;
208
159k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
159k
}
_ZN5doris11UnpackValueILi14ELi14ELb0EEEmPKh
Line
Count
Source
175
159k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
159k
    if (BIT_WIDTH == 0) return 0;
177
178
159k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
159k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
159k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
159k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
159k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
159k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
159k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
159k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
159k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
159k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
159k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
159k
    constexpr bool READ_32_BITS =
202
159k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
159k
    if (READ_32_BITS) {
205
159k
        uint32_t word = in[FIRST_WORD_IDX];
206
159k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
159k
        return word & mask;
208
159k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
159k
}
_ZN5doris11UnpackValueILi14ELi13ELb0EEEmPKh
Line
Count
Source
175
159k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
159k
    if (BIT_WIDTH == 0) return 0;
177
178
159k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
159k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
159k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
159k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
159k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
159k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
159k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
159k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
159k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
159k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
159k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
159k
    constexpr bool READ_32_BITS =
202
159k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
159k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
159k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
159k
    word >>= FIRST_BIT_OFFSET;
212
213
159k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
159k
    return word & mask;
220
159k
}
_ZN5doris11UnpackValueILi14ELi12ELb0EEEmPKh
Line
Count
Source
175
159k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
159k
    if (BIT_WIDTH == 0) return 0;
177
178
159k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
159k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
159k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
159k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
159k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
159k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
159k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
159k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
159k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
159k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
159k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
159k
    constexpr bool READ_32_BITS =
202
159k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
159k
    if (READ_32_BITS) {
205
159k
        uint32_t word = in[FIRST_WORD_IDX];
206
159k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
159k
        return word & mask;
208
159k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
159k
}
_ZN5doris11UnpackValueILi14ELi11ELb0EEEmPKh
Line
Count
Source
175
159k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
159k
    if (BIT_WIDTH == 0) return 0;
177
178
159k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
159k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
159k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
159k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
159k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
159k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
159k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
159k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
159k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
159k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
159k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
159k
    constexpr bool READ_32_BITS =
202
159k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
159k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
159k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
159k
    word >>= FIRST_BIT_OFFSET;
212
213
159k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
159k
    return word & mask;
220
159k
}
_ZN5doris11UnpackValueILi14ELi10ELb0EEEmPKh
Line
Count
Source
175
159k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
159k
    if (BIT_WIDTH == 0) return 0;
177
178
159k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
159k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
159k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
159k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
159k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
159k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
159k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
159k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
159k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
159k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
159k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
159k
    constexpr bool READ_32_BITS =
202
159k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
159k
    if (READ_32_BITS) {
205
159k
        uint32_t word = in[FIRST_WORD_IDX];
206
159k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
159k
        return word & mask;
208
159k
    }
209
210
4
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
4
    word >>= FIRST_BIT_OFFSET;
212
213
4
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
4
    return word & mask;
220
159k
}
_ZN5doris11UnpackValueILi14ELi9ELb0EEEmPKh
Line
Count
Source
175
159k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
159k
    if (BIT_WIDTH == 0) return 0;
177
178
159k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
159k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
159k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
159k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
159k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
159k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
159k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
159k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
159k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
159k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
159k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
159k
    constexpr bool READ_32_BITS =
202
159k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
159k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
159k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
159k
    word >>= FIRST_BIT_OFFSET;
212
213
159k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
159k
    return word & mask;
220
159k
}
_ZN5doris11UnpackValueILi14ELi8ELb0EEEmPKh
Line
Count
Source
175
159k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
159k
    if (BIT_WIDTH == 0) return 0;
177
178
159k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
159k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
159k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
159k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
159k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
159k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
159k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
159k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
159k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
159k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
159k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
159k
    constexpr bool READ_32_BITS =
202
159k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
159k
    if (READ_32_BITS) {
205
159k
        uint32_t word = in[FIRST_WORD_IDX];
206
159k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
159k
        return word & mask;
208
159k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
159k
}
_ZN5doris11UnpackValueILi14ELi7ELb0EEEmPKh
Line
Count
Source
175
162k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
162k
    if (BIT_WIDTH == 0) return 0;
177
178
162k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
162k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
162k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
162k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
162k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
162k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
162k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
162k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
162k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
162k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
162k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
162k
    constexpr bool READ_32_BITS =
202
162k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
162k
    if (READ_32_BITS) {
205
162k
        uint32_t word = in[FIRST_WORD_IDX];
206
162k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
162k
        return word & mask;
208
162k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
162k
}
_ZN5doris11UnpackValueILi14ELi6ELb0EEEmPKh
Line
Count
Source
175
162k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
162k
    if (BIT_WIDTH == 0) return 0;
177
178
162k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
162k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
162k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
162k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
162k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
162k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
162k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
162k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
162k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
162k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
162k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
162k
    constexpr bool READ_32_BITS =
202
162k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
162k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
162k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
162k
    word >>= FIRST_BIT_OFFSET;
212
213
162k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
162k
    return word & mask;
220
162k
}
_ZN5doris11UnpackValueILi14ELi5ELb0EEEmPKh
Line
Count
Source
175
162k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
162k
    if (BIT_WIDTH == 0) return 0;
177
178
162k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
162k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
162k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
162k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
162k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
162k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
162k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
162k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
162k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
162k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
162k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
162k
    constexpr bool READ_32_BITS =
202
162k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
162k
    if (READ_32_BITS) {
205
162k
        uint32_t word = in[FIRST_WORD_IDX];
206
162k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
162k
        return word & mask;
208
162k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
162k
}
_ZN5doris11UnpackValueILi14ELi4ELb0EEEmPKh
Line
Count
Source
175
162k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
162k
    if (BIT_WIDTH == 0) return 0;
177
178
162k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
162k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
162k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
162k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
162k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
162k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
162k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
162k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
162k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
162k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
162k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
162k
    constexpr bool READ_32_BITS =
202
162k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
162k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
162k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
162k
    word >>= FIRST_BIT_OFFSET;
212
213
162k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
162k
    return word & mask;
220
162k
}
_ZN5doris11UnpackValueILi14ELi3ELb0EEEmPKh
Line
Count
Source
175
162k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
162k
    if (BIT_WIDTH == 0) return 0;
177
178
162k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
162k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
162k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
162k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
162k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
162k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
162k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
162k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
162k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
162k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
162k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
162k
    constexpr bool READ_32_BITS =
202
162k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
162k
    if (READ_32_BITS) {
205
162k
        uint32_t word = in[FIRST_WORD_IDX];
206
162k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
162k
        return word & mask;
208
162k
    }
209
210
4
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
4
    word >>= FIRST_BIT_OFFSET;
212
213
4
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
4
    return word & mask;
220
162k
}
_ZN5doris11UnpackValueILi14ELi2ELb0EEEmPKh
Line
Count
Source
175
162k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
162k
    if (BIT_WIDTH == 0) return 0;
177
178
162k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
162k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
162k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
162k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
162k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
162k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
162k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
162k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
162k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
162k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
162k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
162k
    constexpr bool READ_32_BITS =
202
162k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
162k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
162k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
162k
    word >>= FIRST_BIT_OFFSET;
212
213
162k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
162k
    return word & mask;
220
162k
}
_ZN5doris11UnpackValueILi14ELi1ELb0EEEmPKh
Line
Count
Source
175
162k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
162k
    if (BIT_WIDTH == 0) return 0;
177
178
162k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
162k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
162k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
162k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
162k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
162k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
162k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
162k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
162k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
162k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
162k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
162k
    constexpr bool READ_32_BITS =
202
162k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
162k
    if (READ_32_BITS) {
205
162k
        uint32_t word = in[FIRST_WORD_IDX];
206
162k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
162k
        return word & mask;
208
162k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
162k
}
_ZN5doris11UnpackValueILi14ELi0ELb0EEEmPKh
Line
Count
Source
175
162k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
162k
    if (BIT_WIDTH == 0) return 0;
177
178
162k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
162k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
162k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
162k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
162k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
162k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
162k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
162k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
162k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
162k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
162k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
162k
    constexpr bool READ_32_BITS =
202
162k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
162k
    if (READ_32_BITS) {
205
162k
        uint32_t word = in[FIRST_WORD_IDX];
206
162k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
162k
        return word & mask;
208
162k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
162k
}
_ZN5doris11UnpackValueILi15ELi0ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi1ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi2ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi3ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi4ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi5ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi6ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi7ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi8ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi9ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi10ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi11ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi12ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi13ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi14ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi15ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi16ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi17ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi18ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi19ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi20ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi21ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi22ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi23ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi24ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi25ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi26ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi27ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi28ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi29ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
369k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
369k
    word >>= FIRST_BIT_OFFSET;
212
213
369k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
369k
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi30ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
369k
        uint32_t word = in[FIRST_WORD_IDX];
206
369k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
369k
        return word & mask;
208
369k
    }
209
210
8
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
8
    word >>= FIRST_BIT_OFFSET;
212
213
8
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
8
    return word & mask;
220
369k
}
_ZN5doris11UnpackValueILi15ELi31ELb1EEEmPKh
Line
Count
Source
175
369k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
369k
    if (BIT_WIDTH == 0) return 0;
177
178
369k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
369k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
369k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
369k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
369k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
369k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
369k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
369k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
369k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
369k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
369k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
369k
    constexpr bool READ_32_BITS =
202
369k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
369k
    if (READ_32_BITS) {
205
369k
        uint32_t word = in[FIRST_WORD_IDX];
206
369k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
369k
        return word & mask;
208
369k
    }
209
210
12
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
12
    word >>= FIRST_BIT_OFFSET;
212
213
12
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
12
    return word & mask;
220
369k
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi15ELi23ELb0EEEmPKh
Line
Count
Source
175
29.5k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
29.5k
    if (BIT_WIDTH == 0) return 0;
177
178
29.5k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
29.5k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
29.5k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
29.5k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
29.5k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
29.5k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
29.5k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
29.5k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
29.5k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
29.5k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
29.5k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
29.5k
    constexpr bool READ_32_BITS =
202
29.5k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
29.5k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
29.5k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
29.5k
    word >>= FIRST_BIT_OFFSET;
212
213
29.5k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
29.5k
    return word & mask;
220
29.5k
}
_ZN5doris11UnpackValueILi15ELi22ELb0EEEmPKh
Line
Count
Source
175
29.5k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
29.5k
    if (BIT_WIDTH == 0) return 0;
177
178
29.5k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
29.5k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
29.5k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
29.5k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
29.5k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
29.5k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
29.5k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
29.5k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
29.5k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
29.5k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
29.5k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
29.5k
    constexpr bool READ_32_BITS =
202
29.5k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
29.5k
    if (READ_32_BITS) {
205
29.5k
        uint32_t word = in[FIRST_WORD_IDX];
206
29.5k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
29.5k
        return word & mask;
208
29.5k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
29.5k
}
_ZN5doris11UnpackValueILi15ELi21ELb0EEEmPKh
Line
Count
Source
175
29.5k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
29.5k
    if (BIT_WIDTH == 0) return 0;
177
178
29.5k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
29.5k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
29.5k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
29.5k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
29.5k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
29.5k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
29.5k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
29.5k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
29.5k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
29.5k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
29.5k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
29.5k
    constexpr bool READ_32_BITS =
202
29.5k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
29.5k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
29.5k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
29.5k
    word >>= FIRST_BIT_OFFSET;
212
213
29.5k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
29.5k
    return word & mask;
220
29.5k
}
_ZN5doris11UnpackValueILi15ELi20ELb0EEEmPKh
Line
Count
Source
175
29.5k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
29.5k
    if (BIT_WIDTH == 0) return 0;
177
178
29.5k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
29.5k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
29.5k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
29.5k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
29.5k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
29.5k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
29.5k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
29.5k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
29.5k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
29.5k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
29.5k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
29.5k
    constexpr bool READ_32_BITS =
202
29.5k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
29.5k
    if (READ_32_BITS) {
205
29.5k
        uint32_t word = in[FIRST_WORD_IDX];
206
29.5k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
29.5k
        return word & mask;
208
29.5k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
29.5k
}
_ZN5doris11UnpackValueILi15ELi19ELb0EEEmPKh
Line
Count
Source
175
29.5k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
29.5k
    if (BIT_WIDTH == 0) return 0;
177
178
29.5k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
29.5k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
29.5k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
29.5k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
29.5k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
29.5k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
29.5k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
29.5k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
29.5k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
29.5k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
29.5k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
29.5k
    constexpr bool READ_32_BITS =
202
29.5k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
29.5k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
29.5k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
29.5k
    word >>= FIRST_BIT_OFFSET;
212
213
29.5k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
29.5k
    return word & mask;
220
29.5k
}
_ZN5doris11UnpackValueILi15ELi18ELb0EEEmPKh
Line
Count
Source
175
29.5k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
29.5k
    if (BIT_WIDTH == 0) return 0;
177
178
29.5k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
29.5k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
29.5k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
29.5k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
29.5k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
29.5k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
29.5k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
29.5k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
29.5k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
29.5k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
29.5k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
29.5k
    constexpr bool READ_32_BITS =
202
29.5k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
29.5k
    if (READ_32_BITS) {
205
29.5k
        uint32_t word = in[FIRST_WORD_IDX];
206
29.5k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
29.5k
        return word & mask;
208
29.5k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
29.5k
}
_ZN5doris11UnpackValueILi15ELi17ELb0EEEmPKh
Line
Count
Source
175
29.5k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
29.5k
    if (BIT_WIDTH == 0) return 0;
177
178
29.5k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
29.5k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
29.5k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
29.5k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
29.5k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
29.5k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
29.5k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
29.5k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
29.5k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
29.5k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
29.5k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
29.5k
    constexpr bool READ_32_BITS =
202
29.5k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
29.5k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
29.5k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
29.5k
    word >>= FIRST_BIT_OFFSET;
212
213
29.5k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
29.5k
    return word & mask;
220
29.5k
}
_ZN5doris11UnpackValueILi15ELi16ELb0EEEmPKh
Line
Count
Source
175
29.5k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
29.5k
    if (BIT_WIDTH == 0) return 0;
177
178
29.5k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
29.5k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
29.5k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
29.5k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
29.5k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
29.5k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
29.5k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
29.5k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
29.5k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
29.5k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
29.5k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
29.5k
    constexpr bool READ_32_BITS =
202
29.5k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
29.5k
    if (READ_32_BITS) {
205
29.5k
        uint32_t word = in[FIRST_WORD_IDX];
206
29.5k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
29.5k
        return word & mask;
208
29.5k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
29.5k
}
_ZN5doris11UnpackValueILi15ELi15ELb0EEEmPKh
Line
Count
Source
175
33.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
33.1k
    if (BIT_WIDTH == 0) return 0;
177
178
33.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
33.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
33.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
33.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
33.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
33.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
33.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
33.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
33.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
33.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
33.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
33.1k
    constexpr bool READ_32_BITS =
202
33.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
33.1k
    if (READ_32_BITS) {
205
33.1k
        uint32_t word = in[FIRST_WORD_IDX];
206
33.1k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
33.1k
        return word & mask;
208
33.1k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
33.1k
}
_ZN5doris11UnpackValueILi15ELi14ELb0EEEmPKh
Line
Count
Source
175
33.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
33.1k
    if (BIT_WIDTH == 0) return 0;
177
178
33.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
33.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
33.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
33.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
33.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
33.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
33.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
33.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
33.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
33.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
33.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
33.1k
    constexpr bool READ_32_BITS =
202
33.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
33.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
33.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
33.1k
    word >>= FIRST_BIT_OFFSET;
212
213
33.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
33.1k
    return word & mask;
220
33.1k
}
_ZN5doris11UnpackValueILi15ELi13ELb0EEEmPKh
Line
Count
Source
175
33.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
33.1k
    if (BIT_WIDTH == 0) return 0;
177
178
33.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
33.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
33.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
33.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
33.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
33.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
33.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
33.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
33.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
33.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
33.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
33.1k
    constexpr bool READ_32_BITS =
202
33.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
33.1k
    if (READ_32_BITS) {
205
33.1k
        uint32_t word = in[FIRST_WORD_IDX];
206
33.1k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
33.1k
        return word & mask;
208
33.1k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
33.1k
}
_ZN5doris11UnpackValueILi15ELi12ELb0EEEmPKh
Line
Count
Source
175
33.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
33.1k
    if (BIT_WIDTH == 0) return 0;
177
178
33.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
33.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
33.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
33.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
33.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
33.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
33.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
33.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
33.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
33.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
33.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
33.1k
    constexpr bool READ_32_BITS =
202
33.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
33.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
33.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
33.1k
    word >>= FIRST_BIT_OFFSET;
212
213
33.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
33.1k
    return word & mask;
220
33.1k
}
_ZN5doris11UnpackValueILi15ELi11ELb0EEEmPKh
Line
Count
Source
175
33.0k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
33.0k
    if (BIT_WIDTH == 0) return 0;
177
178
33.0k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
33.0k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
33.0k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
33.0k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
33.0k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
33.0k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
33.0k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
33.0k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
33.0k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
33.0k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
33.0k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
33.0k
    constexpr bool READ_32_BITS =
202
33.0k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
33.0k
    if (READ_32_BITS) {
205
33.0k
        uint32_t word = in[FIRST_WORD_IDX];
206
33.0k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
33.0k
        return word & mask;
208
33.0k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
33.0k
}
_ZN5doris11UnpackValueILi15ELi10ELb0EEEmPKh
Line
Count
Source
175
33.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
33.1k
    if (BIT_WIDTH == 0) return 0;
177
178
33.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
33.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
33.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
33.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
33.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
33.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
33.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
33.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
33.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
33.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
33.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
33.1k
    constexpr bool READ_32_BITS =
202
33.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
33.1k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
33.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
33.1k
    word >>= FIRST_BIT_OFFSET;
212
213
33.1k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
33.1k
    return word & mask;
220
33.1k
}
_ZN5doris11UnpackValueILi15ELi9ELb0EEEmPKh
Line
Count
Source
175
33.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
33.1k
    if (BIT_WIDTH == 0) return 0;
177
178
33.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
33.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
33.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
33.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
33.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
33.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
33.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
33.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
33.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
33.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
33.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
33.1k
    constexpr bool READ_32_BITS =
202
33.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
33.1k
    if (READ_32_BITS) {
205
33.0k
        uint32_t word = in[FIRST_WORD_IDX];
206
33.0k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
33.0k
        return word & mask;
208
33.0k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
33.1k
}
_ZN5doris11UnpackValueILi15ELi8ELb0EEEmPKh
Line
Count
Source
175
33.0k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
33.0k
    if (BIT_WIDTH == 0) return 0;
177
178
33.0k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
33.0k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
33.0k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
33.0k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
33.0k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
33.0k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
33.0k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
33.0k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
33.0k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
33.0k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
33.0k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
33.0k
    constexpr bool READ_32_BITS =
202
33.0k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
33.0k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
33.0k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
33.0k
    word >>= FIRST_BIT_OFFSET;
212
213
33.0k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
33.0k
    return word & mask;
220
33.0k
}
_ZN5doris11UnpackValueILi15ELi7ELb0EEEmPKh
Line
Count
Source
175
37.6k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.6k
    if (BIT_WIDTH == 0) return 0;
177
178
37.6k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.6k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.6k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.6k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.6k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.6k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.6k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.6k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.6k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.6k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.6k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.6k
    constexpr bool READ_32_BITS =
202
37.6k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.6k
    if (READ_32_BITS) {
205
37.6k
        uint32_t word = in[FIRST_WORD_IDX];
206
37.6k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
37.6k
        return word & mask;
208
37.6k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
37.6k
}
_ZN5doris11UnpackValueILi15ELi6ELb0EEEmPKh
Line
Count
Source
175
37.6k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.6k
    if (BIT_WIDTH == 0) return 0;
177
178
37.6k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.6k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.6k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.6k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.6k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.6k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.6k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.6k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.6k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.6k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.6k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.6k
    constexpr bool READ_32_BITS =
202
37.6k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.6k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.6k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.6k
    word >>= FIRST_BIT_OFFSET;
212
213
37.6k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.6k
    return word & mask;
220
37.6k
}
_ZN5doris11UnpackValueILi15ELi5ELb0EEEmPKh
Line
Count
Source
175
37.6k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.6k
    if (BIT_WIDTH == 0) return 0;
177
178
37.6k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.6k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.6k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.6k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.6k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.6k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.6k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.6k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.6k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.6k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.6k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.6k
    constexpr bool READ_32_BITS =
202
37.6k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.6k
    if (READ_32_BITS) {
205
37.6k
        uint32_t word = in[FIRST_WORD_IDX];
206
37.6k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
37.6k
        return word & mask;
208
37.6k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
37.6k
}
_ZN5doris11UnpackValueILi15ELi4ELb0EEEmPKh
Line
Count
Source
175
37.6k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.6k
    if (BIT_WIDTH == 0) return 0;
177
178
37.6k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.6k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.6k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.6k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.6k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.6k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.6k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.6k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.6k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.6k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.6k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.6k
    constexpr bool READ_32_BITS =
202
37.6k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.6k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.6k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.6k
    word >>= FIRST_BIT_OFFSET;
212
213
37.6k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.6k
    return word & mask;
220
37.6k
}
_ZN5doris11UnpackValueILi15ELi3ELb0EEEmPKh
Line
Count
Source
175
37.6k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.6k
    if (BIT_WIDTH == 0) return 0;
177
178
37.6k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.6k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.6k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.6k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.6k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.6k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.6k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.6k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.6k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.6k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.6k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.6k
    constexpr bool READ_32_BITS =
202
37.6k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.6k
    if (READ_32_BITS) {
205
37.6k
        uint32_t word = in[FIRST_WORD_IDX];
206
37.6k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
37.6k
        return word & mask;
208
37.6k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
37.6k
}
_ZN5doris11UnpackValueILi15ELi2ELb0EEEmPKh
Line
Count
Source
175
37.6k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.6k
    if (BIT_WIDTH == 0) return 0;
177
178
37.6k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.6k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.6k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.6k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.6k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.6k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.6k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.6k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.6k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.6k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.6k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.6k
    constexpr bool READ_32_BITS =
202
37.6k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.6k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
37.6k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
37.6k
    word >>= FIRST_BIT_OFFSET;
212
213
37.6k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
37.6k
    return word & mask;
220
37.6k
}
_ZN5doris11UnpackValueILi15ELi1ELb0EEEmPKh
Line
Count
Source
175
37.6k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.6k
    if (BIT_WIDTH == 0) return 0;
177
178
37.6k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.6k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.6k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.6k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.6k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.6k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.6k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.6k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.6k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.6k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.6k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.6k
    constexpr bool READ_32_BITS =
202
37.6k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.6k
    if (READ_32_BITS) {
205
37.6k
        uint32_t word = in[FIRST_WORD_IDX];
206
37.6k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
37.6k
        return word & mask;
208
37.6k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
37.6k
}
_ZN5doris11UnpackValueILi15ELi0ELb0EEEmPKh
Line
Count
Source
175
37.5k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
37.5k
    if (BIT_WIDTH == 0) return 0;
177
178
37.5k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
37.5k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
37.5k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
37.5k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
37.5k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
37.5k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
37.5k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
37.5k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
37.5k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
37.5k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
37.5k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
37.5k
    constexpr bool READ_32_BITS =
202
37.5k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
37.5k
    if (READ_32_BITS) {
205
37.5k
        uint32_t word = in[FIRST_WORD_IDX];
206
37.5k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
37.5k
        return word & mask;
208
37.5k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
37.5k
}
_ZN5doris11UnpackValueILi16ELi0ELb1EEEmPKh
Line
Count
Source
175
660k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
660k
    if (BIT_WIDTH == 0) return 0;
177
178
660k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
660k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
660k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
660k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
660k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
660k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
660k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
660k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
660k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
660k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
660k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
660k
    constexpr bool READ_32_BITS =
202
660k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
660k
    if (READ_32_BITS) {
205
660k
        uint32_t word = in[FIRST_WORD_IDX];
206
660k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
660k
        return word & mask;
208
660k
    }
209
210
126
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
126
    word >>= FIRST_BIT_OFFSET;
212
213
126
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
126
    return word & mask;
220
660k
}
_ZN5doris11UnpackValueILi16ELi1ELb1EEEmPKh
Line
Count
Source
175
660k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
660k
    if (BIT_WIDTH == 0) return 0;
177
178
660k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
660k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
660k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
660k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
660k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
660k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
660k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
660k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
660k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
660k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
660k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
660k
    constexpr bool READ_32_BITS =
202
660k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
660k
    if (READ_32_BITS) {
205
660k
        uint32_t word = in[FIRST_WORD_IDX];
206
660k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
660k
        return word & mask;
208
660k
    }
209
210
106
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
106
    word >>= FIRST_BIT_OFFSET;
212
213
106
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
106
    return word & mask;
220
660k
}
_ZN5doris11UnpackValueILi16ELi2ELb1EEEmPKh
Line
Count
Source
175
660k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
660k
    if (BIT_WIDTH == 0) return 0;
177
178
660k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
660k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
660k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
660k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
660k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
660k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
660k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
660k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
660k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
660k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
660k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
660k
    constexpr bool READ_32_BITS =
202
660k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
660k
    if (READ_32_BITS) {
205
660k
        uint32_t word = in[FIRST_WORD_IDX];
206
660k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
660k
        return word & mask;
208
660k
    }
209
210
398
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
398
    word >>= FIRST_BIT_OFFSET;
212
213
398
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
398
    return word & mask;
220
660k
}
_ZN5doris11UnpackValueILi16ELi3ELb1EEEmPKh
Line
Count
Source
175
660k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
660k
    if (BIT_WIDTH == 0) return 0;
177
178
660k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
660k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
660k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
660k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
660k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
660k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
660k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
660k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
660k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
660k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
660k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
660k
    constexpr bool READ_32_BITS =
202
660k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
660k
    if (READ_32_BITS) {
205
660k
        uint32_t word = in[FIRST_WORD_IDX];
206
660k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
660k
        return word & mask;
208
660k
    }
209
210
310
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
310
    word >>= FIRST_BIT_OFFSET;
212
213
310
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
310
    return word & mask;
220
660k
}
_ZN5doris11UnpackValueILi16ELi4ELb1EEEmPKh
Line
Count
Source
175
660k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
660k
    if (BIT_WIDTH == 0) return 0;
177
178
660k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
660k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
660k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
660k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
660k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
660k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
660k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
660k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
660k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
660k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
660k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
660k
    constexpr bool READ_32_BITS =
202
660k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
660k
    if (READ_32_BITS) {
205
659k
        uint32_t word = in[FIRST_WORD_IDX];
206
659k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
659k
        return word & mask;
208
659k
    }
209
210
174
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
174
    word >>= FIRST_BIT_OFFSET;
212
213
174
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
174
    return word & mask;
220
660k
}
_ZN5doris11UnpackValueILi16ELi5ELb1EEEmPKh
Line
Count
Source
175
660k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
660k
    if (BIT_WIDTH == 0) return 0;
177
178
660k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
660k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
660k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
660k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
660k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
660k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
660k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
660k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
660k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
660k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
660k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
660k
    constexpr bool READ_32_BITS =
202
660k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
660k
    if (READ_32_BITS) {
205
659k
        uint32_t word = in[FIRST_WORD_IDX];
206
659k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
659k
        return word & mask;
208
659k
    }
209
210
706
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
706
    word >>= FIRST_BIT_OFFSET;
212
213
706
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
706
    return word & mask;
220
660k
}
_ZN5doris11UnpackValueILi16ELi6ELb1EEEmPKh
Line
Count
Source
175
660k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
660k
    if (BIT_WIDTH == 0) return 0;
177
178
660k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
660k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
660k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
660k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
660k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
660k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
660k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
660k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
660k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
660k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
660k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
660k
    constexpr bool READ_32_BITS =
202
660k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
669k
    if (READ_32_BITS) {
205
669k
        uint32_t word = in[FIRST_WORD_IDX];
206
669k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
669k
        return word & mask;
208
669k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
660k
}
_ZN5doris11UnpackValueILi16ELi7ELb1EEEmPKh
Line
Count
Source
175
673k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
673k
    if (BIT_WIDTH == 0) return 0;
177
178
673k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
673k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
673k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
673k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
673k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
673k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
673k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
673k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
673k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
673k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
673k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
673k
    constexpr bool READ_32_BITS =
202
673k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
673k
    if (READ_32_BITS) {
205
667k
        uint32_t word = in[FIRST_WORD_IDX];
206
667k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
667k
        return word & mask;
208
667k
    }
209
210
5.60k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
5.60k
    word >>= FIRST_BIT_OFFSET;
212
213
5.60k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
5.60k
    return word & mask;
220
673k
}
_ZN5doris11UnpackValueILi16ELi8ELb1EEEmPKh
Line
Count
Source
175
673k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
673k
    if (BIT_WIDTH == 0) return 0;
177
178
673k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
673k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
673k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
673k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
673k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
673k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
673k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
673k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
673k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
673k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
673k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
673k
    constexpr bool READ_32_BITS =
202
673k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
673k
    if (READ_32_BITS) {
205
671k
        uint32_t word = in[FIRST_WORD_IDX];
206
671k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
671k
        return word & mask;
208
671k
    }
209
210
2.69k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.69k
    word >>= FIRST_BIT_OFFSET;
212
213
2.69k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.69k
    return word & mask;
220
673k
}
_ZN5doris11UnpackValueILi16ELi9ELb1EEEmPKh
Line
Count
Source
175
672k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
672k
    if (BIT_WIDTH == 0) return 0;
177
178
672k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
672k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
672k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
672k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
672k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
672k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
672k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
672k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
672k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
672k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
672k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
672k
    constexpr bool READ_32_BITS =
202
672k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
672k
    if (READ_32_BITS) {
205
670k
        uint32_t word = in[FIRST_WORD_IDX];
206
670k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
670k
        return word & mask;
208
670k
    }
209
210
2.52k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.52k
    word >>= FIRST_BIT_OFFSET;
212
213
2.52k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.52k
    return word & mask;
220
672k
}
_ZN5doris11UnpackValueILi16ELi10ELb1EEEmPKh
Line
Count
Source
175
672k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
672k
    if (BIT_WIDTH == 0) return 0;
177
178
672k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
672k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
672k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
672k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
672k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
672k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
672k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
672k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
672k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
672k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
672k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
672k
    constexpr bool READ_32_BITS =
202
672k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
672k
    if (READ_32_BITS) {
205
669k
        uint32_t word = in[FIRST_WORD_IDX];
206
669k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
669k
        return word & mask;
208
669k
    }
209
210
3.31k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.31k
    word >>= FIRST_BIT_OFFSET;
212
213
3.31k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.31k
    return word & mask;
220
672k
}
_ZN5doris11UnpackValueILi16ELi11ELb1EEEmPKh
Line
Count
Source
175
672k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
672k
    if (BIT_WIDTH == 0) return 0;
177
178
672k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
672k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
672k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
672k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
672k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
672k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
672k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
672k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
672k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
672k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
672k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
672k
    constexpr bool READ_32_BITS =
202
672k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
672k
    if (READ_32_BITS) {
205
668k
        uint32_t word = in[FIRST_WORD_IDX];
206
668k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
668k
        return word & mask;
208
668k
    }
209
210
3.42k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.42k
    word >>= FIRST_BIT_OFFSET;
212
213
3.42k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.42k
    return word & mask;
220
672k
}
_ZN5doris11UnpackValueILi16ELi12ELb1EEEmPKh
Line
Count
Source
175
671k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
671k
    if (BIT_WIDTH == 0) return 0;
177
178
671k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
671k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
671k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
671k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
671k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
671k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
671k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
671k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
671k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
671k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
671k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
671k
    constexpr bool READ_32_BITS =
202
671k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
671k
    if (READ_32_BITS) {
205
667k
        uint32_t word = in[FIRST_WORD_IDX];
206
667k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
667k
        return word & mask;
208
667k
    }
209
210
3.60k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.60k
    word >>= FIRST_BIT_OFFSET;
212
213
3.60k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.60k
    return word & mask;
220
671k
}
_ZN5doris11UnpackValueILi16ELi13ELb1EEEmPKh
Line
Count
Source
175
671k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
671k
    if (BIT_WIDTH == 0) return 0;
177
178
671k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
671k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
671k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
671k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
671k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
671k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
671k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
671k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
671k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
671k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
671k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
671k
    constexpr bool READ_32_BITS =
202
671k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
671k
    if (READ_32_BITS) {
205
668k
        uint32_t word = in[FIRST_WORD_IDX];
206
668k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
668k
        return word & mask;
208
668k
    }
209
210
3.03k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.03k
    word >>= FIRST_BIT_OFFSET;
212
213
3.03k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.03k
    return word & mask;
220
671k
}
_ZN5doris11UnpackValueILi16ELi14ELb1EEEmPKh
Line
Count
Source
175
670k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
670k
    if (BIT_WIDTH == 0) return 0;
177
178
670k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
670k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
670k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
670k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
670k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
670k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
670k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
670k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
670k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
670k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
670k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
670k
    constexpr bool READ_32_BITS =
202
670k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
670k
    if (READ_32_BITS) {
205
667k
        uint32_t word = in[FIRST_WORD_IDX];
206
667k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
667k
        return word & mask;
208
667k
    }
209
210
2.29k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.29k
    word >>= FIRST_BIT_OFFSET;
212
213
2.29k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.29k
    return word & mask;
220
670k
}
_ZN5doris11UnpackValueILi16ELi15ELb1EEEmPKh
Line
Count
Source
175
670k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
670k
    if (BIT_WIDTH == 0) return 0;
177
178
670k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
670k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
670k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
670k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
670k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
670k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
670k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
670k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
670k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
670k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
670k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
670k
    constexpr bool READ_32_BITS =
202
670k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
670k
    if (READ_32_BITS) {
205
667k
        uint32_t word = in[FIRST_WORD_IDX];
206
667k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
667k
        return word & mask;
208
667k
    }
209
210
2.27k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.27k
    word >>= FIRST_BIT_OFFSET;
212
213
2.27k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.27k
    return word & mask;
220
670k
}
_ZN5doris11UnpackValueILi16ELi16ELb1EEEmPKh
Line
Count
Source
175
667k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
667k
    if (BIT_WIDTH == 0) return 0;
177
178
667k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
667k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
667k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
667k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
667k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
667k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
667k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
667k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
667k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
667k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
667k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
667k
    constexpr bool READ_32_BITS =
202
667k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
667k
    if (READ_32_BITS) {
205
667k
        uint32_t word = in[FIRST_WORD_IDX];
206
667k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
667k
        return word & mask;
208
667k
    }
209
210
298
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
298
    word >>= FIRST_BIT_OFFSET;
212
213
298
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
298
    return word & mask;
220
667k
}
_ZN5doris11UnpackValueILi16ELi17ELb1EEEmPKh
Line
Count
Source
175
667k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
667k
    if (BIT_WIDTH == 0) return 0;
177
178
667k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
667k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
667k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
667k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
667k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
667k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
667k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
667k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
667k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
667k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
667k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
667k
    constexpr bool READ_32_BITS =
202
667k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
667k
    if (READ_32_BITS) {
205
667k
        uint32_t word = in[FIRST_WORD_IDX];
206
667k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
667k
        return word & mask;
208
667k
    }
209
210
72
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
72
    word >>= FIRST_BIT_OFFSET;
212
213
72
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
72
    return word & mask;
220
667k
}
_ZN5doris11UnpackValueILi16ELi18ELb1EEEmPKh
Line
Count
Source
175
667k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
667k
    if (BIT_WIDTH == 0) return 0;
177
178
667k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
667k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
667k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
667k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
667k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
667k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
667k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
667k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
667k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
667k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
667k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
667k
    constexpr bool READ_32_BITS =
202
667k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
667k
    if (READ_32_BITS) {
205
666k
        uint32_t word = in[FIRST_WORD_IDX];
206
666k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
666k
        return word & mask;
208
666k
    }
209
210
856
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
856
    word >>= FIRST_BIT_OFFSET;
212
213
856
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
856
    return word & mask;
220
667k
}
_ZN5doris11UnpackValueILi16ELi19ELb1EEEmPKh
Line
Count
Source
175
667k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
667k
    if (BIT_WIDTH == 0) return 0;
177
178
667k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
667k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
667k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
667k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
667k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
667k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
667k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
667k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
667k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
667k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
667k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
667k
    constexpr bool READ_32_BITS =
202
667k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
667k
    if (READ_32_BITS) {
205
667k
        uint32_t word = in[FIRST_WORD_IDX];
206
667k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
667k
        return word & mask;
208
667k
    }
209
210
698
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
698
    word >>= FIRST_BIT_OFFSET;
212
213
698
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
698
    return word & mask;
220
667k
}
_ZN5doris11UnpackValueILi16ELi20ELb1EEEmPKh
Line
Count
Source
175
667k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
667k
    if (BIT_WIDTH == 0) return 0;
177
178
667k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
667k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
667k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
667k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
667k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
667k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
667k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
667k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
667k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
667k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
667k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
667k
    constexpr bool READ_32_BITS =
202
667k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
667k
    if (READ_32_BITS) {
205
666k
        uint32_t word = in[FIRST_WORD_IDX];
206
666k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
666k
        return word & mask;
208
666k
    }
209
210
1.16k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.16k
    word >>= FIRST_BIT_OFFSET;
212
213
1.16k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.16k
    return word & mask;
220
667k
}
_ZN5doris11UnpackValueILi16ELi21ELb1EEEmPKh
Line
Count
Source
175
667k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
667k
    if (BIT_WIDTH == 0) return 0;
177
178
667k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
667k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
667k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
667k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
667k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
667k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
667k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
667k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
667k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
667k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
667k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
667k
    constexpr bool READ_32_BITS =
202
667k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
667k
    if (READ_32_BITS) {
205
665k
        uint32_t word = in[FIRST_WORD_IDX];
206
665k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
665k
        return word & mask;
208
665k
    }
209
210
1.54k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.54k
    word >>= FIRST_BIT_OFFSET;
212
213
1.54k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.54k
    return word & mask;
220
667k
}
_ZN5doris11UnpackValueILi16ELi22ELb1EEEmPKh
Line
Count
Source
175
666k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
666k
    if (BIT_WIDTH == 0) return 0;
177
178
666k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
666k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
666k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
666k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
666k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
666k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
666k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
666k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
666k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
666k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
666k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
666k
    constexpr bool READ_32_BITS =
202
666k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
666k
    if (READ_32_BITS) {
205
665k
        uint32_t word = in[FIRST_WORD_IDX];
206
665k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
665k
        return word & mask;
208
665k
    }
209
210
1.07k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.07k
    word >>= FIRST_BIT_OFFSET;
212
213
1.07k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.07k
    return word & mask;
220
666k
}
_ZN5doris11UnpackValueILi16ELi23ELb1EEEmPKh
Line
Count
Source
175
665k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
665k
    if (BIT_WIDTH == 0) return 0;
177
178
665k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
665k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
665k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
665k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
665k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
665k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
665k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
665k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
665k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
665k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
665k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
665k
    constexpr bool READ_32_BITS =
202
665k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
665k
    if (READ_32_BITS) {
205
664k
        uint32_t word = in[FIRST_WORD_IDX];
206
664k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
664k
        return word & mask;
208
664k
    }
209
210
1.03k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.03k
    word >>= FIRST_BIT_OFFSET;
212
213
1.03k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.03k
    return word & mask;
220
665k
}
_ZN5doris11UnpackValueILi16ELi24ELb1EEEmPKh
Line
Count
Source
175
665k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
665k
    if (BIT_WIDTH == 0) return 0;
177
178
665k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
665k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
665k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
665k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
665k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
665k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
665k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
665k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
665k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
665k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
665k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
665k
    constexpr bool READ_32_BITS =
202
665k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
665k
    if (READ_32_BITS) {
205
662k
        uint32_t word = in[FIRST_WORD_IDX];
206
662k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
662k
        return word & mask;
208
662k
    }
209
210
2.78k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.78k
    word >>= FIRST_BIT_OFFSET;
212
213
2.78k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.78k
    return word & mask;
220
665k
}
_ZN5doris11UnpackValueILi16ELi25ELb1EEEmPKh
Line
Count
Source
175
664k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
664k
    if (BIT_WIDTH == 0) return 0;
177
178
664k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
664k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
664k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
664k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
664k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
664k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
664k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
664k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
664k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
664k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
664k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
664k
    constexpr bool READ_32_BITS =
202
664k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
664k
    if (READ_32_BITS) {
205
661k
        uint32_t word = in[FIRST_WORD_IDX];
206
661k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
661k
        return word & mask;
208
661k
    }
209
210
2.84k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.84k
    word >>= FIRST_BIT_OFFSET;
212
213
2.84k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.84k
    return word & mask;
220
664k
}
_ZN5doris11UnpackValueILi16ELi26ELb1EEEmPKh
Line
Count
Source
175
664k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
664k
    if (BIT_WIDTH == 0) return 0;
177
178
664k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
664k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
664k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
664k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
664k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
664k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
664k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
664k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
664k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
664k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
664k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
664k
    constexpr bool READ_32_BITS =
202
664k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
664k
    if (READ_32_BITS) {
205
661k
        uint32_t word = in[FIRST_WORD_IDX];
206
661k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
661k
        return word & mask;
208
661k
    }
209
210
2.91k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.91k
    word >>= FIRST_BIT_OFFSET;
212
213
2.91k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.91k
    return word & mask;
220
664k
}
_ZN5doris11UnpackValueILi16ELi27ELb1EEEmPKh
Line
Count
Source
175
663k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
663k
    if (BIT_WIDTH == 0) return 0;
177
178
663k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
663k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
663k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
663k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
663k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
663k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
663k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
663k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
663k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
663k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
663k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
663k
    constexpr bool READ_32_BITS =
202
663k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
663k
    if (READ_32_BITS) {
205
659k
        uint32_t word = in[FIRST_WORD_IDX];
206
659k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
659k
        return word & mask;
208
659k
    }
209
210
3.71k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.71k
    word >>= FIRST_BIT_OFFSET;
212
213
3.71k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.71k
    return word & mask;
220
663k
}
_ZN5doris11UnpackValueILi16ELi28ELb1EEEmPKh
Line
Count
Source
175
663k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
663k
    if (BIT_WIDTH == 0) return 0;
177
178
663k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
663k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
663k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
663k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
663k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
663k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
663k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
663k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
663k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
663k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
663k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
663k
    constexpr bool READ_32_BITS =
202
663k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
663k
    if (READ_32_BITS) {
205
659k
        uint32_t word = in[FIRST_WORD_IDX];
206
659k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
659k
        return word & mask;
208
659k
    }
209
210
3.41k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
3.41k
    word >>= FIRST_BIT_OFFSET;
212
213
3.41k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
3.41k
    return word & mask;
220
663k
}
_ZN5doris11UnpackValueILi16ELi29ELb1EEEmPKh
Line
Count
Source
175
662k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
662k
    if (BIT_WIDTH == 0) return 0;
177
178
662k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
662k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
662k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
662k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
662k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
662k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
662k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
662k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
662k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
662k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
662k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
662k
    constexpr bool READ_32_BITS =
202
662k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
662k
    if (READ_32_BITS) {
205
660k
        uint32_t word = in[FIRST_WORD_IDX];
206
660k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
660k
        return word & mask;
208
660k
    }
209
210
1.97k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.97k
    word >>= FIRST_BIT_OFFSET;
212
213
1.97k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.97k
    return word & mask;
220
662k
}
_ZN5doris11UnpackValueILi16ELi30ELb1EEEmPKh
Line
Count
Source
175
662k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
662k
    if (BIT_WIDTH == 0) return 0;
177
178
662k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
662k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
662k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
662k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
662k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
662k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
662k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
662k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
662k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
662k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
662k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
662k
    constexpr bool READ_32_BITS =
202
662k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
662k
    if (READ_32_BITS) {
205
660k
        uint32_t word = in[FIRST_WORD_IDX];
206
660k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
660k
        return word & mask;
208
660k
    }
209
210
2.16k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2.16k
    word >>= FIRST_BIT_OFFSET;
212
213
2.16k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2.16k
    return word & mask;
220
662k
}
_ZN5doris11UnpackValueILi16ELi31ELb1EEEmPKh
Line
Count
Source
175
660k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
660k
    if (BIT_WIDTH == 0) return 0;
177
178
660k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
660k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
660k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
660k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
660k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
660k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
660k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
660k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
660k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
660k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
660k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
660k
    constexpr bool READ_32_BITS =
202
660k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
660k
    if (READ_32_BITS) {
205
660k
        uint32_t word = in[FIRST_WORD_IDX];
206
660k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
660k
        return word & mask;
208
660k
    }
209
210
124
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
124
    word >>= FIRST_BIT_OFFSET;
212
213
124
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
124
    return word & mask;
220
660k
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi16ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi16ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi16ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi16ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi16ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi16ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi16ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi16ELi23ELb0EEEmPKh
Line
Count
Source
175
46.8k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
46.8k
    if (BIT_WIDTH == 0) return 0;
177
178
46.8k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
46.8k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
46.8k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
46.8k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
46.8k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
46.8k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
46.8k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
46.8k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
46.8k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
46.8k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
46.8k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
46.8k
    constexpr bool READ_32_BITS =
202
46.8k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
46.8k
    if (READ_32_BITS) {
205
46.8k
        uint32_t word = in[FIRST_WORD_IDX];
206
46.8k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
46.8k
        return word & mask;
208
46.8k
    }
209
210
4
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
4
    word >>= FIRST_BIT_OFFSET;
212
213
4
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
4
    return word & mask;
220
46.8k
}
_ZN5doris11UnpackValueILi16ELi22ELb0EEEmPKh
Line
Count
Source
175
46.8k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
46.8k
    if (BIT_WIDTH == 0) return 0;
177
178
46.8k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
46.8k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
46.8k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
46.8k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
46.8k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
46.8k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
46.8k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
46.8k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
46.8k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
46.8k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
46.8k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
46.8k
    constexpr bool READ_32_BITS =
202
46.8k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
46.8k
    if (READ_32_BITS) {
205
46.8k
        uint32_t word = in[FIRST_WORD_IDX];
206
46.8k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
46.8k
        return word & mask;
208
46.8k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
46.8k
}
_ZN5doris11UnpackValueILi16ELi21ELb0EEEmPKh
Line
Count
Source
175
46.8k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
46.8k
    if (BIT_WIDTH == 0) return 0;
177
178
46.8k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
46.8k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
46.8k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
46.8k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
46.8k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
46.8k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
46.8k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
46.8k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
46.8k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
46.8k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
46.8k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
46.8k
    constexpr bool READ_32_BITS =
202
46.8k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
46.8k
    if (READ_32_BITS) {
205
46.8k
        uint32_t word = in[FIRST_WORD_IDX];
206
46.8k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
46.8k
        return word & mask;
208
46.8k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
46.8k
}
_ZN5doris11UnpackValueILi16ELi20ELb0EEEmPKh
Line
Count
Source
175
46.8k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
46.8k
    if (BIT_WIDTH == 0) return 0;
177
178
46.8k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
46.8k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
46.8k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
46.8k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
46.8k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
46.8k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
46.8k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
46.8k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
46.8k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
46.8k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
46.8k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
46.8k
    constexpr bool READ_32_BITS =
202
46.8k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
46.8k
    if (READ_32_BITS) {
205
46.8k
        uint32_t word = in[FIRST_WORD_IDX];
206
46.8k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
46.8k
        return word & mask;
208
46.8k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
46.8k
}
_ZN5doris11UnpackValueILi16ELi19ELb0EEEmPKh
Line
Count
Source
175
46.8k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
46.8k
    if (BIT_WIDTH == 0) return 0;
177
178
46.8k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
46.8k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
46.8k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
46.8k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
46.8k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
46.8k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
46.8k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
46.8k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
46.8k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
46.8k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
46.8k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
46.8k
    constexpr bool READ_32_BITS =
202
46.8k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
46.8k
    if (READ_32_BITS) {
205
46.8k
        uint32_t word = in[FIRST_WORD_IDX];
206
46.8k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
46.8k
        return word & mask;
208
46.8k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
46.8k
}
_ZN5doris11UnpackValueILi16ELi18ELb0EEEmPKh
Line
Count
Source
175
46.8k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
46.8k
    if (BIT_WIDTH == 0) return 0;
177
178
46.8k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
46.8k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
46.8k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
46.8k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
46.8k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
46.8k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
46.8k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
46.8k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
46.8k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
46.8k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
46.8k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
46.8k
    constexpr bool READ_32_BITS =
202
46.8k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
46.8k
    if (READ_32_BITS) {
205
46.8k
        uint32_t word = in[FIRST_WORD_IDX];
206
46.8k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
46.8k
        return word & mask;
208
46.8k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
46.8k
}
_ZN5doris11UnpackValueILi16ELi17ELb0EEEmPKh
Line
Count
Source
175
46.8k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
46.8k
    if (BIT_WIDTH == 0) return 0;
177
178
46.8k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
46.8k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
46.8k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
46.8k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
46.8k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
46.8k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
46.8k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
46.8k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
46.8k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
46.8k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
46.8k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
46.8k
    constexpr bool READ_32_BITS =
202
46.8k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
46.8k
    if (READ_32_BITS) {
205
46.8k
        uint32_t word = in[FIRST_WORD_IDX];
206
46.8k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
46.8k
        return word & mask;
208
46.8k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
46.8k
}
_ZN5doris11UnpackValueILi16ELi16ELb0EEEmPKh
Line
Count
Source
175
46.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
46.9k
    if (BIT_WIDTH == 0) return 0;
177
178
46.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
46.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
46.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
46.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
46.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
46.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
46.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
46.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
46.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
46.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
46.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
46.9k
    constexpr bool READ_32_BITS =
202
46.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
46.9k
    if (READ_32_BITS) {
205
46.9k
        uint32_t word = in[FIRST_WORD_IDX];
206
46.9k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
46.9k
        return word & mask;
208
46.9k
    }
209
210
8
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
8
    word >>= FIRST_BIT_OFFSET;
212
213
8
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
8
    return word & mask;
220
46.9k
}
_ZN5doris11UnpackValueILi16ELi15ELb0EEEmPKh
Line
Count
Source
175
48.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
48.2k
    if (BIT_WIDTH == 0) return 0;
177
178
48.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
48.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
48.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
48.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
48.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
48.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
48.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
48.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
48.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
48.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
48.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
48.2k
    constexpr bool READ_32_BITS =
202
48.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
48.2k
    if (READ_32_BITS) {
205
48.2k
        uint32_t word = in[FIRST_WORD_IDX];
206
48.2k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
48.2k
        return word & mask;
208
48.2k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
48.2k
}
_ZN5doris11UnpackValueILi16ELi14ELb0EEEmPKh
Line
Count
Source
175
48.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
48.2k
    if (BIT_WIDTH == 0) return 0;
177
178
48.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
48.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
48.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
48.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
48.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
48.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
48.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
48.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
48.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
48.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
48.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
48.2k
    constexpr bool READ_32_BITS =
202
48.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
48.2k
    if (READ_32_BITS) {
205
48.2k
        uint32_t word = in[FIRST_WORD_IDX];
206
48.2k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
48.2k
        return word & mask;
208
48.2k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
48.2k
}
_ZN5doris11UnpackValueILi16ELi13ELb0EEEmPKh
Line
Count
Source
175
48.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
48.3k
    if (BIT_WIDTH == 0) return 0;
177
178
48.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
48.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
48.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
48.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
48.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
48.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
48.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
48.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
48.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
48.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
48.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
48.3k
    constexpr bool READ_32_BITS =
202
48.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
48.3k
    if (READ_32_BITS) {
205
48.3k
        uint32_t word = in[FIRST_WORD_IDX];
206
48.3k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
48.3k
        return word & mask;
208
48.3k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
48.3k
}
_ZN5doris11UnpackValueILi16ELi12ELb0EEEmPKh
Line
Count
Source
175
48.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
48.3k
    if (BIT_WIDTH == 0) return 0;
177
178
48.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
48.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
48.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
48.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
48.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
48.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
48.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
48.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
48.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
48.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
48.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
48.3k
    constexpr bool READ_32_BITS =
202
48.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
48.3k
    if (READ_32_BITS) {
205
48.3k
        uint32_t word = in[FIRST_WORD_IDX];
206
48.3k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
48.3k
        return word & mask;
208
48.3k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
48.3k
}
_ZN5doris11UnpackValueILi16ELi11ELb0EEEmPKh
Line
Count
Source
175
48.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
48.3k
    if (BIT_WIDTH == 0) return 0;
177
178
48.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
48.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
48.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
48.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
48.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
48.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
48.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
48.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
48.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
48.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
48.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
48.3k
    constexpr bool READ_32_BITS =
202
48.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
48.3k
    if (READ_32_BITS) {
205
48.3k
        uint32_t word = in[FIRST_WORD_IDX];
206
48.3k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
48.3k
        return word & mask;
208
48.3k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
48.3k
}
_ZN5doris11UnpackValueILi16ELi10ELb0EEEmPKh
Line
Count
Source
175
48.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
48.3k
    if (BIT_WIDTH == 0) return 0;
177
178
48.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
48.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
48.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
48.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
48.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
48.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
48.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
48.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
48.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
48.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
48.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
48.3k
    constexpr bool READ_32_BITS =
202
48.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
48.3k
    if (READ_32_BITS) {
205
48.3k
        uint32_t word = in[FIRST_WORD_IDX];
206
48.3k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
48.3k
        return word & mask;
208
48.3k
    }
209
210
4
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
4
    word >>= FIRST_BIT_OFFSET;
212
213
4
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
4
    return word & mask;
220
48.3k
}
_ZN5doris11UnpackValueILi16ELi9ELb0EEEmPKh
Line
Count
Source
175
48.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
48.3k
    if (BIT_WIDTH == 0) return 0;
177
178
48.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
48.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
48.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
48.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
48.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
48.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
48.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
48.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
48.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
48.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
48.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
48.3k
    constexpr bool READ_32_BITS =
202
48.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
48.3k
    if (READ_32_BITS) {
205
48.3k
        uint32_t word = in[FIRST_WORD_IDX];
206
18.4E
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
48.3k
        return word & mask;
208
48.3k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
48.3k
}
_ZN5doris11UnpackValueILi16ELi8ELb0EEEmPKh
Line
Count
Source
175
48.3k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
48.3k
    if (BIT_WIDTH == 0) return 0;
177
178
48.3k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
48.3k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
48.3k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
48.3k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
48.3k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
48.3k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
48.3k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
48.3k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
48.3k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
48.3k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
48.3k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
48.3k
    constexpr bool READ_32_BITS =
202
48.3k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
48.3k
    if (READ_32_BITS) {
205
48.3k
        uint32_t word = in[FIRST_WORD_IDX];
206
48.3k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
48.3k
        return word & mask;
208
48.3k
    }
209
210
4
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
4
    word >>= FIRST_BIT_OFFSET;
212
213
4
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
4
    return word & mask;
220
48.3k
}
_ZN5doris11UnpackValueILi16ELi7ELb0EEEmPKh
Line
Count
Source
175
50.0k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
50.0k
    if (BIT_WIDTH == 0) return 0;
177
178
50.0k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
50.0k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
50.0k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
50.0k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
50.0k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
50.0k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
50.0k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
50.0k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
50.0k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
50.0k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
50.0k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
50.0k
    constexpr bool READ_32_BITS =
202
50.0k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
50.0k
    if (READ_32_BITS) {
205
50.0k
        uint32_t word = in[FIRST_WORD_IDX];
206
50.0k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
50.0k
        return word & mask;
208
50.0k
    }
209
210
8
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
8
    word >>= FIRST_BIT_OFFSET;
212
213
8
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
8
    return word & mask;
220
50.0k
}
_ZN5doris11UnpackValueILi16ELi6ELb0EEEmPKh
Line
Count
Source
175
50.0k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
50.0k
    if (BIT_WIDTH == 0) return 0;
177
178
50.0k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
50.0k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
50.0k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
50.0k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
50.0k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
50.0k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
50.0k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
50.0k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
50.0k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
50.0k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
50.0k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
50.0k
    constexpr bool READ_32_BITS =
202
50.0k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
50.0k
    if (READ_32_BITS) {
205
50.0k
        uint32_t word = in[FIRST_WORD_IDX];
206
50.0k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
50.0k
        return word & mask;
208
50.0k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
50.0k
}
_ZN5doris11UnpackValueILi16ELi5ELb0EEEmPKh
Line
Count
Source
175
50.0k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
50.0k
    if (BIT_WIDTH == 0) return 0;
177
178
50.0k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
50.0k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
50.0k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
50.0k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
50.0k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
50.0k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
50.0k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
50.0k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
50.0k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
50.0k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
50.0k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
50.0k
    constexpr bool READ_32_BITS =
202
50.0k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
50.0k
    if (READ_32_BITS) {
205
50.0k
        uint32_t word = in[FIRST_WORD_IDX];
206
50.0k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
50.0k
        return word & mask;
208
50.0k
    }
209
210
4
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
4
    word >>= FIRST_BIT_OFFSET;
212
213
4
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
4
    return word & mask;
220
50.0k
}
_ZN5doris11UnpackValueILi16ELi4ELb0EEEmPKh
Line
Count
Source
175
50.0k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
50.0k
    if (BIT_WIDTH == 0) return 0;
177
178
50.0k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
50.0k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
50.0k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
50.0k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
50.0k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
50.0k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
50.0k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
50.0k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
50.0k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
50.0k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
50.0k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
50.0k
    constexpr bool READ_32_BITS =
202
50.0k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
50.0k
    if (READ_32_BITS) {
205
50.0k
        uint32_t word = in[FIRST_WORD_IDX];
206
50.0k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
50.0k
        return word & mask;
208
50.0k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
50.0k
}
_ZN5doris11UnpackValueILi16ELi3ELb0EEEmPKh
Line
Count
Source
175
50.0k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
50.0k
    if (BIT_WIDTH == 0) return 0;
177
178
50.0k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
50.0k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
50.0k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
50.0k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
50.0k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
50.0k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
50.0k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
50.0k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
50.0k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
50.0k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
50.0k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
50.0k
    constexpr bool READ_32_BITS =
202
50.0k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
50.0k
    if (READ_32_BITS) {
205
50.0k
        uint32_t word = in[FIRST_WORD_IDX];
206
50.0k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
50.0k
        return word & mask;
208
50.0k
    }
209
210
4
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
4
    word >>= FIRST_BIT_OFFSET;
212
213
4
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
4
    return word & mask;
220
50.0k
}
_ZN5doris11UnpackValueILi16ELi2ELb0EEEmPKh
Line
Count
Source
175
50.0k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
50.0k
    if (BIT_WIDTH == 0) return 0;
177
178
50.0k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
50.0k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
50.0k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
50.0k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
50.0k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
50.0k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
50.0k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
50.0k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
50.0k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
50.0k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
50.0k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
50.0k
    constexpr bool READ_32_BITS =
202
50.0k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
50.0k
    if (READ_32_BITS) {
205
50.0k
        uint32_t word = in[FIRST_WORD_IDX];
206
50.0k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
50.0k
        return word & mask;
208
50.0k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
50.0k
}
_ZN5doris11UnpackValueILi16ELi1ELb0EEEmPKh
Line
Count
Source
175
50.0k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
50.0k
    if (BIT_WIDTH == 0) return 0;
177
178
50.0k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
50.0k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
50.0k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
50.0k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
50.0k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
50.0k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
50.0k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
50.0k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
50.0k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
50.0k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
50.0k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
50.0k
    constexpr bool READ_32_BITS =
202
50.0k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
50.0k
    if (READ_32_BITS) {
205
50.0k
        uint32_t word = in[FIRST_WORD_IDX];
206
50.0k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
50.0k
        return word & mask;
208
50.0k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
50.0k
}
_ZN5doris11UnpackValueILi16ELi0ELb0EEEmPKh
Line
Count
Source
175
50.0k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
50.0k
    if (BIT_WIDTH == 0) return 0;
177
178
50.0k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
50.0k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
50.0k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
50.0k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
50.0k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
50.0k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
50.0k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
50.0k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
50.0k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
50.0k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
50.0k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
50.0k
    constexpr bool READ_32_BITS =
202
50.0k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
50.0k
    if (READ_32_BITS) {
205
50.0k
        uint32_t word = in[FIRST_WORD_IDX];
206
50.0k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
50.0k
        return word & mask;
208
50.0k
    }
209
210
4
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
4
    word >>= FIRST_BIT_OFFSET;
212
213
4
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
4
    return word & mask;
220
50.0k
}
_ZN5doris11UnpackValueILi17ELi0ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi1ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi2ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi3ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi4ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi5ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi6ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi7ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi8ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi9ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi10ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi11ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi12ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi13ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi14ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi15ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi16ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi17ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi18ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi19ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi20ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi21ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi22ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi23ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi24ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi25ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi26ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi27ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi28ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi29ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi30ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
1.21M
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
1.21M
    word >>= FIRST_BIT_OFFSET;
212
213
1.21M
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
1.21M
    return word & mask;
220
1.21M
}
_ZN5doris11UnpackValueILi17ELi31ELb1EEEmPKh
Line
Count
Source
175
1.21M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
1.21M
    if (BIT_WIDTH == 0) return 0;
177
178
1.21M
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
1.21M
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
1.21M
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
1.21M
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
1.21M
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
1.21M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
1.21M
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
1.21M
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
1.21M
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
1.21M
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
1.21M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
1.21M
    constexpr bool READ_32_BITS =
202
1.21M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
1.21M
    if (READ_32_BITS) {
205
1.21M
        uint32_t word = in[FIRST_WORD_IDX];
206
1.21M
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
1.21M
        return word & mask;
208
1.21M
    }
209
210
554
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
554
    word >>= FIRST_BIT_OFFSET;
212
213
554
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
554
    return word & mask;
220
1.21M
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi17ELi23ELb0EEEmPKh
Line
Count
Source
175
82.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
82.9k
    if (BIT_WIDTH == 0) return 0;
177
178
82.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
82.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
82.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
82.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
82.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
82.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
82.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
82.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
82.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
82.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
82.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
82.9k
    constexpr bool READ_32_BITS =
202
82.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
82.9k
    if (READ_32_BITS) {
205
82.9k
        uint32_t word = in[FIRST_WORD_IDX];
206
82.9k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
82.9k
        return word & mask;
208
82.9k
    }
209
210
4
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
4
    word >>= FIRST_BIT_OFFSET;
212
213
4
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
4
    return word & mask;
220
82.9k
}
_ZN5doris11UnpackValueILi17ELi22ELb0EEEmPKh
Line
Count
Source
175
82.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
82.9k
    if (BIT_WIDTH == 0) return 0;
177
178
82.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
82.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
82.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
82.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
82.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
82.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
82.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
82.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
82.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
82.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
82.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
82.9k
    constexpr bool READ_32_BITS =
202
82.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
82.9k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
82.9k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
82.9k
    word >>= FIRST_BIT_OFFSET;
212
213
82.9k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
82.9k
    return word & mask;
220
82.9k
}
_ZN5doris11UnpackValueILi17ELi21ELb0EEEmPKh
Line
Count
Source
175
82.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
82.9k
    if (BIT_WIDTH == 0) return 0;
177
178
82.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
82.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
82.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
82.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
82.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
82.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
82.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
82.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
82.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
82.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
82.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
82.9k
    constexpr bool READ_32_BITS =
202
82.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
82.9k
    if (READ_32_BITS) {
205
82.9k
        uint32_t word = in[FIRST_WORD_IDX];
206
82.9k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
82.9k
        return word & mask;
208
82.9k
    }
209
210
4
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
4
    word >>= FIRST_BIT_OFFSET;
212
213
4
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
4
    return word & mask;
220
82.9k
}
_ZN5doris11UnpackValueILi17ELi20ELb0EEEmPKh
Line
Count
Source
175
82.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
82.9k
    if (BIT_WIDTH == 0) return 0;
177
178
82.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
82.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
82.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
82.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
82.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
82.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
82.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
82.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
82.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
82.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
82.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
82.9k
    constexpr bool READ_32_BITS =
202
82.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
82.9k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
82.9k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
82.9k
    word >>= FIRST_BIT_OFFSET;
212
213
82.9k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
82.9k
    return word & mask;
220
82.9k
}
_ZN5doris11UnpackValueILi17ELi19ELb0EEEmPKh
Line
Count
Source
175
82.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
82.9k
    if (BIT_WIDTH == 0) return 0;
177
178
82.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
82.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
82.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
82.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
82.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
82.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
82.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
82.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
82.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
82.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
82.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
82.9k
    constexpr bool READ_32_BITS =
202
82.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
82.9k
    if (READ_32_BITS) {
205
82.9k
        uint32_t word = in[FIRST_WORD_IDX];
206
82.9k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
82.9k
        return word & mask;
208
82.9k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
82.9k
}
_ZN5doris11UnpackValueILi17ELi18ELb0EEEmPKh
Line
Count
Source
175
82.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
82.9k
    if (BIT_WIDTH == 0) return 0;
177
178
82.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
82.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
82.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
82.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
82.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
82.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
82.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
82.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
82.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
82.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
82.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
82.9k
    constexpr bool READ_32_BITS =
202
82.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
82.9k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
82.9k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
82.9k
    word >>= FIRST_BIT_OFFSET;
212
213
82.9k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
82.9k
    return word & mask;
220
82.9k
}
_ZN5doris11UnpackValueILi17ELi17ELb0EEEmPKh
Line
Count
Source
175
82.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
82.9k
    if (BIT_WIDTH == 0) return 0;
177
178
82.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
82.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
82.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
82.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
82.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
82.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
82.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
82.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
82.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
82.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
82.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
82.9k
    constexpr bool READ_32_BITS =
202
82.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
82.9k
    if (READ_32_BITS) {
205
82.9k
        uint32_t word = in[FIRST_WORD_IDX];
206
82.9k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
82.9k
        return word & mask;
208
82.9k
    }
209
210
6
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
6
    word >>= FIRST_BIT_OFFSET;
212
213
6
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
6
    return word & mask;
220
82.9k
}
_ZN5doris11UnpackValueILi17ELi16ELb0EEEmPKh
Line
Count
Source
175
82.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
82.9k
    if (BIT_WIDTH == 0) return 0;
177
178
82.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
82.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
82.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
82.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
82.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
82.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
82.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
82.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
82.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
82.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
82.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
82.9k
    constexpr bool READ_32_BITS =
202
82.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
82.9k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
82.9k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
82.9k
    word >>= FIRST_BIT_OFFSET;
212
213
82.9k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
82.9k
    return word & mask;
220
82.9k
}
_ZN5doris11UnpackValueILi17ELi15ELb0EEEmPKh
Line
Count
Source
175
82.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
82.9k
    if (BIT_WIDTH == 0) return 0;
177
178
82.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
82.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
82.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
82.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
82.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
82.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
82.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
82.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
82.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
82.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
82.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
82.9k
    constexpr bool READ_32_BITS =
202
82.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
82.9k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
82.9k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
82.9k
    word >>= FIRST_BIT_OFFSET;
212
213
82.9k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
82.9k
    return word & mask;
220
82.9k
}
_ZN5doris11UnpackValueILi17ELi14ELb0EEEmPKh
Line
Count
Source
175
82.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
82.9k
    if (BIT_WIDTH == 0) return 0;
177
178
82.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
82.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
82.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
82.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
82.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
82.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
82.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
82.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
82.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
82.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
82.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
82.9k
    constexpr bool READ_32_BITS =
202
82.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
82.9k
    if (READ_32_BITS) {
205
82.9k
        uint32_t word = in[FIRST_WORD_IDX];
206
82.9k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
82.9k
        return word & mask;
208
82.9k
    }
209
210
6
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
6
    word >>= FIRST_BIT_OFFSET;
212
213
6
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
6
    return word & mask;
220
82.9k
}
_ZN5doris11UnpackValueILi17ELi13ELb0EEEmPKh
Line
Count
Source
175
82.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
82.9k
    if (BIT_WIDTH == 0) return 0;
177
178
82.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
82.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
82.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
82.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
82.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
82.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
82.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
82.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
82.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
82.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
82.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
82.9k
    constexpr bool READ_32_BITS =
202
82.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
82.9k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
82.9k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
82.9k
    word >>= FIRST_BIT_OFFSET;
212
213
82.9k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
82.9k
    return word & mask;
220
82.9k
}
_ZN5doris11UnpackValueILi17ELi12ELb0EEEmPKh
Line
Count
Source
175
82.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
82.9k
    if (BIT_WIDTH == 0) return 0;
177
178
82.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
82.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
82.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
82.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
82.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
82.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
82.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
82.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
82.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
82.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
82.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
82.9k
    constexpr bool READ_32_BITS =
202
82.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
82.9k
    if (READ_32_BITS) {
205
82.9k
        uint32_t word = in[FIRST_WORD_IDX];
206
82.9k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
82.9k
        return word & mask;
208
82.9k
    }
209
210
4
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
4
    word >>= FIRST_BIT_OFFSET;
212
213
4
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
4
    return word & mask;
220
82.9k
}
_ZN5doris11UnpackValueILi17ELi11ELb0EEEmPKh
Line
Count
Source
175
82.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
82.9k
    if (BIT_WIDTH == 0) return 0;
177
178
82.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
82.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
82.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
82.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
82.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
82.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
82.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
82.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
82.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
82.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
82.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
82.9k
    constexpr bool READ_32_BITS =
202
82.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
82.9k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
82.9k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
82.9k
    word >>= FIRST_BIT_OFFSET;
212
213
82.9k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
82.9k
    return word & mask;
220
82.9k
}
_ZN5doris11UnpackValueILi17ELi10ELb0EEEmPKh
Line
Count
Source
175
82.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
82.9k
    if (BIT_WIDTH == 0) return 0;
177
178
82.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
82.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
82.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
82.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
82.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
82.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
82.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
82.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
82.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
82.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
82.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
82.9k
    constexpr bool READ_32_BITS =
202
82.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
82.9k
    if (READ_32_BITS) {
205
82.9k
        uint32_t word = in[FIRST_WORD_IDX];
206
82.9k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
82.9k
        return word & mask;
208
82.9k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
82.9k
}
_ZN5doris11UnpackValueILi17ELi9ELb0EEEmPKh
Line
Count
Source
175
82.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
82.9k
    if (BIT_WIDTH == 0) return 0;
177
178
82.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
82.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
82.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
82.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
82.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
82.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
82.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
82.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
82.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
82.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
82.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
82.9k
    constexpr bool READ_32_BITS =
202
82.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
82.9k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
82.9k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
82.9k
    word >>= FIRST_BIT_OFFSET;
212
213
82.9k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
82.9k
    return word & mask;
220
82.9k
}
_ZN5doris11UnpackValueILi17ELi8ELb0EEEmPKh
Line
Count
Source
175
82.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
82.9k
    if (BIT_WIDTH == 0) return 0;
177
178
82.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
82.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
82.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
82.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
82.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
82.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
82.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
82.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
82.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
82.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
82.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
82.9k
    constexpr bool READ_32_BITS =
202
82.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
82.9k
    if (READ_32_BITS) {
205
82.9k
        uint32_t word = in[FIRST_WORD_IDX];
206
82.9k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
82.9k
        return word & mask;
208
82.9k
    }
209
210
4
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
4
    word >>= FIRST_BIT_OFFSET;
212
213
4
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
4
    return word & mask;
220
82.9k
}
_ZN5doris11UnpackValueILi17ELi7ELb0EEEmPKh
Line
Count
Source
175
82.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
82.9k
    if (BIT_WIDTH == 0) return 0;
177
178
82.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
82.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
82.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
82.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
82.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
82.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
82.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
82.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
82.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
82.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
82.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
82.9k
    constexpr bool READ_32_BITS =
202
82.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
82.9k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
82.9k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
82.9k
    word >>= FIRST_BIT_OFFSET;
212
213
82.9k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
82.9k
    return word & mask;
220
82.9k
}
_ZN5doris11UnpackValueILi17ELi6ELb0EEEmPKh
Line
Count
Source
175
82.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
82.9k
    if (BIT_WIDTH == 0) return 0;
177
178
82.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
82.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
82.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
82.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
82.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
82.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
82.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
82.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
82.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
82.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
82.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
82.9k
    constexpr bool READ_32_BITS =
202
82.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
82.9k
    if (READ_32_BITS) {
205
82.9k
        uint32_t word = in[FIRST_WORD_IDX];
206
18.4E
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
82.9k
        return word & mask;
208
82.9k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
82.9k
}
_ZN5doris11UnpackValueILi17ELi5ELb0EEEmPKh
Line
Count
Source
175
82.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
82.9k
    if (BIT_WIDTH == 0) return 0;
177
178
82.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
82.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
82.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
82.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
82.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
82.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
82.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
82.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
82.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
82.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
82.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
82.9k
    constexpr bool READ_32_BITS =
202
82.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
82.9k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
82.9k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
82.9k
    word >>= FIRST_BIT_OFFSET;
212
213
82.9k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
82.9k
    return word & mask;
220
82.9k
}
_ZN5doris11UnpackValueILi17ELi4ELb0EEEmPKh
Line
Count
Source
175
82.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
82.9k
    if (BIT_WIDTH == 0) return 0;
177
178
82.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
82.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
82.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
82.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
82.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
82.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
82.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
82.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
82.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
82.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
82.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
82.9k
    constexpr bool READ_32_BITS =
202
82.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
82.9k
    if (READ_32_BITS) {
205
82.9k
        uint32_t word = in[FIRST_WORD_IDX];
206
82.9k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
82.9k
        return word & mask;
208
82.9k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
82.9k
}
_ZN5doris11UnpackValueILi17ELi3ELb0EEEmPKh
Line
Count
Source
175
82.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
82.9k
    if (BIT_WIDTH == 0) return 0;
177
178
82.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
82.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
82.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
82.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
82.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
82.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
82.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
82.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
82.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
82.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
82.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
82.9k
    constexpr bool READ_32_BITS =
202
82.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
82.9k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
82.9k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
82.9k
    word >>= FIRST_BIT_OFFSET;
212
213
82.9k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
82.9k
    return word & mask;
220
82.9k
}
_ZN5doris11UnpackValueILi17ELi2ELb0EEEmPKh
Line
Count
Source
175
82.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
82.9k
    if (BIT_WIDTH == 0) return 0;
177
178
82.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
82.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
82.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
82.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
82.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
82.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
82.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
82.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
82.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
82.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
82.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
82.9k
    constexpr bool READ_32_BITS =
202
82.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
82.9k
    if (READ_32_BITS) {
205
82.9k
        uint32_t word = in[FIRST_WORD_IDX];
206
82.9k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
82.9k
        return word & mask;
208
82.9k
    }
209
210
8
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
8
    word >>= FIRST_BIT_OFFSET;
212
213
8
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
8
    return word & mask;
220
82.9k
}
_ZN5doris11UnpackValueILi17ELi1ELb0EEEmPKh
Line
Count
Source
175
82.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
82.9k
    if (BIT_WIDTH == 0) return 0;
177
178
82.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
82.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
82.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
82.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
82.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
82.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
82.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
82.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
82.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
82.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
82.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
82.9k
    constexpr bool READ_32_BITS =
202
82.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
82.9k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
82.9k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
82.9k
    word >>= FIRST_BIT_OFFSET;
212
213
82.9k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
82.9k
    return word & mask;
220
82.9k
}
_ZN5doris11UnpackValueILi17ELi0ELb0EEEmPKh
Line
Count
Source
175
82.9k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
82.9k
    if (BIT_WIDTH == 0) return 0;
177
178
82.9k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
82.9k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
82.9k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
82.9k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
82.9k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
82.9k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
82.9k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
82.9k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
82.9k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
82.9k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
82.9k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
82.9k
    constexpr bool READ_32_BITS =
202
82.9k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
82.9k
    if (READ_32_BITS) {
205
82.9k
        uint32_t word = in[FIRST_WORD_IDX];
206
82.9k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
82.9k
        return word & mask;
208
82.9k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
82.9k
}
_ZN5doris11UnpackValueILi18ELi0ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi1ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi2ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi3ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi4ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi5ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi6ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi7ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi8ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi9ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi10ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi11ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi12ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi13ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi14ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi15ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi16ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi17ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi18ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi19ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi20ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi21ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi22ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi23ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi24ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi25ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi26ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi27ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi28ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi29ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi30ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
381k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
381k
    word >>= FIRST_BIT_OFFSET;
212
213
381k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
381k
    return word & mask;
220
381k
}
_ZN5doris11UnpackValueILi18ELi31ELb1EEEmPKh
Line
Count
Source
175
381k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
381k
    if (BIT_WIDTH == 0) return 0;
177
178
381k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
381k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
381k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
381k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
381k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
381k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
381k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
381k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
381k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
381k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
381k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
381k
    constexpr bool READ_32_BITS =
202
381k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
381k
    if (READ_32_BITS) {
205
381k
        uint32_t word = in[FIRST_WORD_IDX];
206
381k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
381k
        return word & mask;
208
381k
    }
209
210
18.4E
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
18.4E
    word >>= FIRST_BIT_OFFSET;
212
213
18.4E
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
18.4E
    return word & mask;
220
381k
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi18ELi23ELb0EEEmPKh
Line
Count
Source
175
25.6k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
25.6k
    if (BIT_WIDTH == 0) return 0;
177
178
25.6k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
25.6k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
25.6k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
25.6k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
25.6k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
25.6k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
25.6k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
25.6k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
25.6k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
25.6k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
25.6k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
25.6k
    constexpr bool READ_32_BITS =
202
25.6k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
25.6k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
25.6k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
25.6k
    word >>= FIRST_BIT_OFFSET;
212
213
25.6k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
25.6k
    return word & mask;
220
25.6k
}
_ZN5doris11UnpackValueILi18ELi22ELb0EEEmPKh
Line
Count
Source
175
25.6k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
25.6k
    if (BIT_WIDTH == 0) return 0;
177
178
25.6k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
25.6k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
25.6k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
25.6k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
25.6k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
25.6k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
25.6k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
25.6k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
25.6k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
25.6k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
25.6k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
25.6k
    constexpr bool READ_32_BITS =
202
25.6k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
25.6k
    if (READ_32_BITS) {
205
25.6k
        uint32_t word = in[FIRST_WORD_IDX];
206
25.6k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
25.6k
        return word & mask;
208
25.6k
    }
209
210
4
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
4
    word >>= FIRST_BIT_OFFSET;
212
213
4
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
4
    return word & mask;
220
25.6k
}
_ZN5doris11UnpackValueILi18ELi21ELb0EEEmPKh
Line
Count
Source
175
25.6k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
25.6k
    if (BIT_WIDTH == 0) return 0;
177
178
25.6k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
25.6k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
25.6k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
25.6k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
25.6k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
25.6k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
25.6k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
25.6k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
25.6k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
25.6k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
25.6k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
25.6k
    constexpr bool READ_32_BITS =
202
25.6k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
25.6k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
25.6k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
25.6k
    word >>= FIRST_BIT_OFFSET;
212
213
25.6k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
25.6k
    return word & mask;
220
25.6k
}
_ZN5doris11UnpackValueILi18ELi20ELb0EEEmPKh
Line
Count
Source
175
25.6k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
25.6k
    if (BIT_WIDTH == 0) return 0;
177
178
25.6k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
25.6k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
25.6k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
25.6k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
25.6k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
25.6k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
25.6k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
25.6k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
25.6k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
25.6k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
25.6k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
25.6k
    constexpr bool READ_32_BITS =
202
25.6k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
25.6k
    if (READ_32_BITS) {
205
25.6k
        uint32_t word = in[FIRST_WORD_IDX];
206
25.6k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
25.6k
        return word & mask;
208
25.6k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
25.6k
}
_ZN5doris11UnpackValueILi18ELi19ELb0EEEmPKh
Line
Count
Source
175
25.6k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
25.6k
    if (BIT_WIDTH == 0) return 0;
177
178
25.6k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
25.6k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
25.6k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
25.6k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
25.6k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
25.6k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
25.6k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
25.6k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
25.6k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
25.6k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
25.6k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
25.6k
    constexpr bool READ_32_BITS =
202
25.6k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
25.6k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
25.6k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
25.6k
    word >>= FIRST_BIT_OFFSET;
212
213
25.6k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
25.6k
    return word & mask;
220
25.6k
}
_ZN5doris11UnpackValueILi18ELi18ELb0EEEmPKh
Line
Count
Source
175
25.6k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
25.6k
    if (BIT_WIDTH == 0) return 0;
177
178
25.6k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
25.6k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
25.6k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
25.6k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
25.6k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
25.6k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
25.6k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
25.6k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
25.6k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
25.6k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
25.6k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
25.6k
    constexpr bool READ_32_BITS =
202
25.6k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
25.6k
    if (READ_32_BITS) {
205
25.6k
        uint32_t word = in[FIRST_WORD_IDX];
206
25.6k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
25.6k
        return word & mask;
208
25.6k
    }
209
210
2
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
2
    word >>= FIRST_BIT_OFFSET;
212
213
2
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
2
    return word & mask;
220
25.6k
}
_ZN5doris11UnpackValueILi18ELi17ELb0EEEmPKh
Line
Count
Source
175
25.6k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
25.6k
    if (BIT_WIDTH == 0) return 0;
177
178
25.6k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
25.6k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
25.6k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
25.6k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
25.6k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
25.6k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
25.6k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
25.6k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
25.6k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
25.6k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
25.6k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
25.6k
    constexpr bool READ_32_BITS =
202
25.6k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
25.6k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
25.6k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
25.6k
    word >>= FIRST_BIT_OFFSET;
212
213
25.6k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
25.6k
    return word & mask;
220
25.6k
}
_ZN5doris11UnpackValueILi18ELi16ELb0EEEmPKh
Line
Count
Source
175
25.6k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
25.6k
    if (BIT_WIDTH == 0) return 0;
177
178
25.6k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
25.6k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
25.6k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
25.6k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
25.6k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
25.6k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
25.6k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
25.6k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
25.6k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
25.6k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
25.6k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
25.6k
    constexpr bool READ_32_BITS =
202
25.6k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
25.6k
    if (READ_32_BITS) {
205
25.6k
        uint32_t word = in[FIRST_WORD_IDX];
206
25.6k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
25.6k
        return word & mask;
208
25.6k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
25.6k
}
_ZN5doris11UnpackValueILi18ELi15ELb0EEEmPKh
Line
Count
Source
175
25.6k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
25.6k
    if (BIT_WIDTH == 0) return 0;
177
178
25.6k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
25.6k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
25.6k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
25.6k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
25.6k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
25.6k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
25.6k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
25.6k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
25.6k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
25.6k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
25.6k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
25.6k
    constexpr bool READ_32_BITS =
202
25.6k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
25.6k
    if (READ_32_BITS) {
205
25.6k
        uint32_t word = in[FIRST_WORD_IDX];
206
25.6k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
25.6k
        return word & mask;
208
25.6k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
25.6k
}
_ZN5doris11UnpackValueILi18ELi14ELb0EEEmPKh
Line
Count
Source
175
25.6k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
25.6k
    if (BIT_WIDTH == 0) return 0;
177
178
25.6k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
25.6k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
25.6k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
25.6k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
25.6k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
25.6k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
25.6k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
25.6k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
25.6k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
25.6k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
25.6k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
25.6k
    constexpr bool READ_32_BITS =
202
25.6k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
25.6k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
25.6k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
25.6k
    word >>= FIRST_BIT_OFFSET;
212
213
25.6k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
25.6k
    return word & mask;
220
25.6k
}
_ZN5doris11UnpackValueILi18ELi13ELb0EEEmPKh
Line
Count
Source
175
25.6k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
25.6k
    if (BIT_WIDTH == 0) return 0;
177
178
25.6k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
25.6k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
25.6k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
25.6k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
25.6k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
25.6k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
25.6k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
25.6k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
25.6k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
25.6k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
25.6k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
25.6k
    constexpr bool READ_32_BITS =
202
25.6k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
25.6k
    if (READ_32_BITS) {
205
25.6k
        uint32_t word = in[FIRST_WORD_IDX];
206
25.6k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
25.6k
        return word & mask;
208
25.6k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
25.6k
}
_ZN5doris11UnpackValueILi18ELi12ELb0EEEmPKh
Line
Count
Source
175
25.6k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
25.6k
    if (BIT_WIDTH == 0) return 0;
177
178
25.6k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
25.6k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
25.6k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
25.6k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
25.6k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
25.6k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
25.6k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
25.6k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
25.6k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
25.6k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
25.6k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
25.6k
    constexpr bool READ_32_BITS =
202
25.6k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
25.6k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
25.6k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
25.6k
    word >>= FIRST_BIT_OFFSET;
212
213
25.6k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
25.6k
    return word & mask;
220
25.6k
}
_ZN5doris11UnpackValueILi18ELi11ELb0EEEmPKh
Line
Count
Source
175
25.6k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
25.6k
    if (BIT_WIDTH == 0) return 0;
177
178
25.6k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
25.6k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
25.6k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
25.6k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
25.6k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
25.6k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
25.6k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
25.6k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
25.6k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
25.6k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
25.6k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
25.6k
    constexpr bool READ_32_BITS =
202
25.6k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
25.6k
    if (READ_32_BITS) {
205
25.6k
        uint32_t word = in[FIRST_WORD_IDX];
206
25.6k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
25.6k
        return word & mask;
208
25.6k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
25.6k
}
_ZN5doris11UnpackValueILi18ELi10ELb0EEEmPKh
Line
Count
Source
175
25.6k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
25.6k
    if (BIT_WIDTH == 0) return 0;
177
178
25.6k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
25.6k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
25.6k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
25.6k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
25.6k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
25.6k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
25.6k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
25.6k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
25.6k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
25.6k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
25.6k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
25.6k
    constexpr bool READ_32_BITS =
202
25.6k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
25.6k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
25.6k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
25.6k
    word >>= FIRST_BIT_OFFSET;
212
213
25.6k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
25.6k
    return word & mask;
220
25.6k
}
_ZN5doris11UnpackValueILi18ELi9ELb0EEEmPKh
Line
Count
Source
175
25.6k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
25.6k
    if (BIT_WIDTH == 0) return 0;
177
178
25.6k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
25.6k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
25.6k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
25.6k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
25.6k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
25.6k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
25.6k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
25.6k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
25.6k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
25.6k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
25.6k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
25.6k
    constexpr bool READ_32_BITS =
202
25.6k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
25.6k
    if (READ_32_BITS) {
205
25.6k
        uint32_t word = in[FIRST_WORD_IDX];
206
25.6k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
25.6k
        return word & mask;
208
25.6k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
25.6k
}
_ZN5doris11UnpackValueILi18ELi8ELb0EEEmPKh
Line
Count
Source
175
25.6k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
25.6k
    if (BIT_WIDTH == 0) return 0;
177
178
25.6k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
25.6k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
25.6k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
25.6k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
25.6k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
25.6k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
25.6k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
25.6k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
25.6k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
25.6k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
25.6k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
25.6k
    constexpr bool READ_32_BITS =
202
25.6k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
25.6k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
25.6k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
25.6k
    word >>= FIRST_BIT_OFFSET;
212
213
25.6k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
25.6k
    return word & mask;
220
25.6k
}
_ZN5doris11UnpackValueILi18ELi7ELb0EEEmPKh
Line
Count
Source
175
25.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
25.7k
    if (BIT_WIDTH == 0) return 0;
177
178
25.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
25.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
25.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
25.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
25.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
25.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
25.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
25.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
25.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
25.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
25.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
25.7k
    constexpr bool READ_32_BITS =
202
25.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
25.7k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
25.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
25.7k
    word >>= FIRST_BIT_OFFSET;
212
213
25.7k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
25.7k
    return word & mask;
220
25.7k
}
_ZN5doris11UnpackValueILi18ELi6ELb0EEEmPKh
Line
Count
Source
175
25.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
25.7k
    if (BIT_WIDTH == 0) return 0;
177
178
25.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
25.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
25.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
25.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
25.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
25.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
25.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
25.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
25.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
25.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
25.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
25.7k
    constexpr bool READ_32_BITS =
202
25.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
25.7k
    if (READ_32_BITS) {
205
25.7k
        uint32_t word = in[FIRST_WORD_IDX];
206
25.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
25.7k
        return word & mask;
208
25.7k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
25.7k
}
_ZN5doris11UnpackValueILi18ELi5ELb0EEEmPKh
Line
Count
Source
175
25.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
25.7k
    if (BIT_WIDTH == 0) return 0;
177
178
25.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
25.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
25.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
25.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
25.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
25.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
25.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
25.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
25.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
25.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
25.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
25.7k
    constexpr bool READ_32_BITS =
202
25.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
25.7k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
25.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
25.7k
    word >>= FIRST_BIT_OFFSET;
212
213
25.7k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
25.7k
    return word & mask;
220
25.7k
}
_ZN5doris11UnpackValueILi18ELi4ELb0EEEmPKh
Line
Count
Source
175
25.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
25.7k
    if (BIT_WIDTH == 0) return 0;
177
178
25.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
25.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
25.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
25.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
25.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
25.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
25.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
25.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
25.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
25.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
25.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
25.7k
    constexpr bool READ_32_BITS =
202
25.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
25.7k
    if (READ_32_BITS) {
205
25.7k
        uint32_t word = in[FIRST_WORD_IDX];
206
25.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
25.7k
        return word & mask;
208
25.7k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
25.7k
}
_ZN5doris11UnpackValueILi18ELi3ELb0EEEmPKh
Line
Count
Source
175
25.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
25.7k
    if (BIT_WIDTH == 0) return 0;
177
178
25.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
25.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
25.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
25.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
25.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
25.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
25.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
25.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
25.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
25.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
25.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
25.7k
    constexpr bool READ_32_BITS =
202
25.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
25.7k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
25.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
25.7k
    word >>= FIRST_BIT_OFFSET;
212
213
25.7k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
25.7k
    return word & mask;
220
25.7k
}
_ZN5doris11UnpackValueILi18ELi2ELb0EEEmPKh
Line
Count
Source
175
25.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
25.7k
    if (BIT_WIDTH == 0) return 0;
177
178
25.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
25.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
25.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
25.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
25.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
25.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
25.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
25.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
25.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
25.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
25.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
25.7k
    constexpr bool READ_32_BITS =
202
25.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
25.7k
    if (READ_32_BITS) {
205
25.7k
        uint32_t word = in[FIRST_WORD_IDX];
206
25.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
25.7k
        return word & mask;
208
25.7k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
25.7k
}
_ZN5doris11UnpackValueILi18ELi1ELb0EEEmPKh
Line
Count
Source
175
25.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
25.7k
    if (BIT_WIDTH == 0) return 0;
177
178
25.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
25.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
25.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
25.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
25.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
25.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
25.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
25.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
25.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
25.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
25.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
25.7k
    constexpr bool READ_32_BITS =
202
25.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
25.7k
    if (READ_32_BITS) {
205
0
        uint32_t word = in[FIRST_WORD_IDX];
206
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
0
        return word & mask;
208
0
    }
209
210
25.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
25.7k
    word >>= FIRST_BIT_OFFSET;
212
213
25.7k
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
25.7k
    return word & mask;
220
25.7k
}
_ZN5doris11UnpackValueILi18ELi0ELb0EEEmPKh
Line
Count
Source
175
25.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
176
25.7k
    if (BIT_WIDTH == 0) return 0;
177
178
25.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
179
25.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
180
25.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
181
25.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
182
25.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
183
25.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
184
185
25.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
186
25.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
187
25.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
188
189
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
190
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
191
    // enough space in the buffer from the current reading point.
192
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
193
    // is faster.
194
25.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
195
25.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
196
197
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
198
    // necessary) because performance benchmarks show that it is better this way. This seems
199
    // to be due to compiler optimisation issues, so we can revisit it when we update the
200
    // compiler version.
201
25.7k
    constexpr bool READ_32_BITS =
202
25.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
203
204
25.7k
    if (READ_32_BITS) {
205
25.7k
        uint32_t word = in[FIRST_WORD_IDX];
206
25.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
207
25.7k
        return word & mask;
208
25.7k
    }
209
210
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
211
0
    word >>= FIRST_BIT_OFFSET;
212
213
0
    if (WORDS_TO_READ > 2) {
214
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
215
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
216
0
        word |= extra_word << USEFUL_BITS;
217
0
    }
218
219
0
    return word & mask;
220
25.7k
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi0ELb0EEEmPKh
221
222
template <typename OutType>
223
void DecodeValue(OutType* __restrict__ dict, int64_t dict_len, uint32_t idx,
224
                 OutType* __restrict__ out_val, bool* __restrict__ decode_error) {
225
    if (UNLIKELY(idx >= dict_len)) {
226
        *decode_error = true;
227
    } else {
228
        // Use memcpy() because we can't assume sufficient alignment in some cases (e.g.
229
        // 16 byte decimals).
230
        memcpy(out_val, &dict[idx], sizeof(OutType));
231
    }
232
}
233
234
template <typename OutType, int BIT_WIDTH>
235
const uint8_t* BitPacking::Unpack32Values(const uint8_t* __restrict__ in, int64_t in_bytes,
236
21.5M
                                          OutType* __restrict__ out) {
237
21.5M
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
238
21.5M
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
239
21.5M
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
240
21.5M
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
241
21.5M
    DCHECK_GE(in_bytes, BYTES_TO_READ);
242
243
    // Call UnpackValue for 0 <= i < 32.
244
21.5M
#pragma push_macro("UNPACK_VALUE_CALL")
245
21.5M
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
246
688M
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
247
248
688M
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
249
21.5M
    return in + BYTES_TO_READ;
250
21.5M
#pragma pop_macro("UNPACK_VALUE_CALL")
251
21.5M
}
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi0EEEPKhS3_lPT_
_ZN5doris10BitPacking14Unpack32ValuesIjLi1EEEPKhS3_lPT_
Line
Count
Source
236
418k
                                          OutType* __restrict__ out) {
237
418k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
238
418k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
239
418k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
240
418k
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
241
418k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
242
243
    // Call UnpackValue for 0 <= i < 32.
244
418k
#pragma push_macro("UNPACK_VALUE_CALL")
245
418k
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
246
418k
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
247
248
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
249
418k
    return in + BYTES_TO_READ;
250
418k
#pragma pop_macro("UNPACK_VALUE_CALL")
251
418k
}
_ZN5doris10BitPacking14Unpack32ValuesIjLi2EEEPKhS3_lPT_
Line
Count
Source
236
1.70M
                                          OutType* __restrict__ out) {
237
1.70M
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
238
1.70M
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
239
1.70M
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
240
1.70M
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
241
1.70M
    DCHECK_GE(in_bytes, BYTES_TO_READ);
242
243
    // Call UnpackValue for 0 <= i < 32.
244
1.70M
#pragma push_macro("UNPACK_VALUE_CALL")
245
1.70M
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
246
1.70M
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
247
248
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
249
1.70M
    return in + BYTES_TO_READ;
250
1.70M
#pragma pop_macro("UNPACK_VALUE_CALL")
251
1.70M
}
_ZN5doris10BitPacking14Unpack32ValuesIjLi3EEEPKhS3_lPT_
Line
Count
Source
236
1.43M
                                          OutType* __restrict__ out) {
237
1.43M
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
238
1.43M
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
239
1.43M
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
240
1.43M
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
241
1.43M
    DCHECK_GE(in_bytes, BYTES_TO_READ);
242
243
    // Call UnpackValue for 0 <= i < 32.
244
1.43M
#pragma push_macro("UNPACK_VALUE_CALL")
245
1.43M
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
246
1.43M
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
247
248
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
249
1.43M
    return in + BYTES_TO_READ;
250
1.43M
#pragma pop_macro("UNPACK_VALUE_CALL")
251
1.43M
}
_ZN5doris10BitPacking14Unpack32ValuesIjLi4EEEPKhS3_lPT_
Line
Count
Source
236
4.99M
                                          OutType* __restrict__ out) {
237
4.99M
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
238
4.99M
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
239
4.99M
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
240
4.99M
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
241
4.99M
    DCHECK_GE(in_bytes, BYTES_TO_READ);
242
243
    // Call UnpackValue for 0 <= i < 32.
244
4.99M
#pragma push_macro("UNPACK_VALUE_CALL")
245
4.99M
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
246
4.99M
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
247
248
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
249
4.99M
    return in + BYTES_TO_READ;
250
4.99M
#pragma pop_macro("UNPACK_VALUE_CALL")
251
4.99M
}
_ZN5doris10BitPacking14Unpack32ValuesIjLi5EEEPKhS3_lPT_
Line
Count
Source
236
11.1k
                                          OutType* __restrict__ out) {
237
11.1k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
238
11.1k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
239
11.1k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
240
11.1k
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
241
11.1k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
242
243
    // Call UnpackValue for 0 <= i < 32.
244
11.1k
#pragma push_macro("UNPACK_VALUE_CALL")
245
11.1k
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
246
11.1k
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
247
248
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
249
11.1k
    return in + BYTES_TO_READ;
250
11.1k
#pragma pop_macro("UNPACK_VALUE_CALL")
251
11.1k
}
_ZN5doris10BitPacking14Unpack32ValuesIjLi6EEEPKhS3_lPT_
Line
Count
Source
236
3.26M
                                          OutType* __restrict__ out) {
237
3.26M
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
238
3.26M
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
239
3.26M
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
240
3.26M
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
241
3.26M
    DCHECK_GE(in_bytes, BYTES_TO_READ);
242
243
    // Call UnpackValue for 0 <= i < 32.
244
3.26M
#pragma push_macro("UNPACK_VALUE_CALL")
245
3.26M
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
246
3.26M
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
247
248
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
249
3.26M
    return in + BYTES_TO_READ;
250
3.26M
#pragma pop_macro("UNPACK_VALUE_CALL")
251
3.26M
}
_ZN5doris10BitPacking14Unpack32ValuesIjLi7EEEPKhS3_lPT_
Line
Count
Source
236
37.3k
                                          OutType* __restrict__ out) {
237
37.3k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
238
37.3k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
239
37.3k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
240
37.3k
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
241
37.3k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
242
243
    // Call UnpackValue for 0 <= i < 32.
244
37.3k
#pragma push_macro("UNPACK_VALUE_CALL")
245
37.3k
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
246
37.3k
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
247
248
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
249
37.3k
    return in + BYTES_TO_READ;
250
37.3k
#pragma pop_macro("UNPACK_VALUE_CALL")
251
37.3k
}
_ZN5doris10BitPacking14Unpack32ValuesIjLi8EEEPKhS3_lPT_
Line
Count
Source
236
20.4k
                                          OutType* __restrict__ out) {
237
20.4k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
238
20.4k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
239
20.4k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
240
20.4k
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
241
20.4k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
242
243
    // Call UnpackValue for 0 <= i < 32.
244
20.4k
#pragma push_macro("UNPACK_VALUE_CALL")
245
20.4k
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
246
20.4k
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
247
248
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
249
20.4k
    return in + BYTES_TO_READ;
250
20.4k
#pragma pop_macro("UNPACK_VALUE_CALL")
251
20.4k
}
_ZN5doris10BitPacking14Unpack32ValuesIjLi9EEEPKhS3_lPT_
Line
Count
Source
236
38.1k
                                          OutType* __restrict__ out) {
237
38.1k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
238
38.1k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
239
38.1k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
240
38.1k
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
241
38.1k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
242
243
    // Call UnpackValue for 0 <= i < 32.
244
38.1k
#pragma push_macro("UNPACK_VALUE_CALL")
245
38.1k
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
246
38.1k
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
247
248
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
249
38.1k
    return in + BYTES_TO_READ;
250
38.1k
#pragma pop_macro("UNPACK_VALUE_CALL")
251
38.1k
}
_ZN5doris10BitPacking14Unpack32ValuesIjLi10EEEPKhS3_lPT_
Line
Count
Source
236
36.4k
                                          OutType* __restrict__ out) {
237
36.4k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
238
36.4k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
239
36.4k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
240
36.4k
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
241
36.4k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
242
243
    // Call UnpackValue for 0 <= i < 32.
244
36.4k
#pragma push_macro("UNPACK_VALUE_CALL")
245
36.4k
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
246
36.4k
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
247
248
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
249
36.4k
    return in + BYTES_TO_READ;
250
36.4k
#pragma pop_macro("UNPACK_VALUE_CALL")
251
36.4k
}
_ZN5doris10BitPacking14Unpack32ValuesIjLi11EEEPKhS3_lPT_
Line
Count
Source
236
518k
                                          OutType* __restrict__ out) {
237
518k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
238
518k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
239
518k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
240
518k
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
241
518k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
242
243
    // Call UnpackValue for 0 <= i < 32.
244
518k
#pragma push_macro("UNPACK_VALUE_CALL")
245
518k
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
246
518k
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
247
248
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
249
518k
    return in + BYTES_TO_READ;
250
518k
#pragma pop_macro("UNPACK_VALUE_CALL")
251
518k
}
_ZN5doris10BitPacking14Unpack32ValuesIjLi12EEEPKhS3_lPT_
Line
Count
Source
236
3.78M
                                          OutType* __restrict__ out) {
237
3.78M
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
238
3.78M
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
239
3.78M
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
240
3.78M
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
241
3.78M
    DCHECK_GE(in_bytes, BYTES_TO_READ);
242
243
    // Call UnpackValue for 0 <= i < 32.
244
3.78M
#pragma push_macro("UNPACK_VALUE_CALL")
245
3.78M
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
246
3.78M
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
247
248
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
249
3.78M
    return in + BYTES_TO_READ;
250
3.78M
#pragma pop_macro("UNPACK_VALUE_CALL")
251
3.78M
}
_ZN5doris10BitPacking14Unpack32ValuesIjLi13EEEPKhS3_lPT_
Line
Count
Source
236
350k
                                          OutType* __restrict__ out) {
237
350k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
238
350k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
239
350k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
240
350k
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
241
350k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
242
243
    // Call UnpackValue for 0 <= i < 32.
244
350k
#pragma push_macro("UNPACK_VALUE_CALL")
245
350k
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
246
350k
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
247
248
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
249
350k
    return in + BYTES_TO_READ;
250
350k
#pragma pop_macro("UNPACK_VALUE_CALL")
251
350k
}
_ZN5doris10BitPacking14Unpack32ValuesIjLi14EEEPKhS3_lPT_
Line
Count
Source
236
2.25M
                                          OutType* __restrict__ out) {
237
2.25M
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
238
2.25M
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
239
2.25M
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
240
2.25M
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
241
2.25M
    DCHECK_GE(in_bytes, BYTES_TO_READ);
242
243
    // Call UnpackValue for 0 <= i < 32.
244
2.25M
#pragma push_macro("UNPACK_VALUE_CALL")
245
2.25M
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
246
2.25M
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
247
248
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
249
2.25M
    return in + BYTES_TO_READ;
250
2.25M
#pragma pop_macro("UNPACK_VALUE_CALL")
251
2.25M
}
_ZN5doris10BitPacking14Unpack32ValuesIjLi15EEEPKhS3_lPT_
Line
Count
Source
236
369k
                                          OutType* __restrict__ out) {
237
369k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
238
369k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
239
369k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
240
369k
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
241
369k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
242
243
    // Call UnpackValue for 0 <= i < 32.
244
369k
#pragma push_macro("UNPACK_VALUE_CALL")
245
369k
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
246
369k
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
247
248
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
249
369k
    return in + BYTES_TO_READ;
250
369k
#pragma pop_macro("UNPACK_VALUE_CALL")
251
369k
}
_ZN5doris10BitPacking14Unpack32ValuesIjLi16EEEPKhS3_lPT_
Line
Count
Source
236
660k
                                          OutType* __restrict__ out) {
237
660k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
238
660k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
239
660k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
240
660k
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
241
660k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
242
243
    // Call UnpackValue for 0 <= i < 32.
244
660k
#pragma push_macro("UNPACK_VALUE_CALL")
245
660k
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
246
660k
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
247
248
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
249
660k
    return in + BYTES_TO_READ;
250
660k
#pragma pop_macro("UNPACK_VALUE_CALL")
251
660k
}
_ZN5doris10BitPacking14Unpack32ValuesIjLi17EEEPKhS3_lPT_
Line
Count
Source
236
1.21M
                                          OutType* __restrict__ out) {
237
1.21M
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
238
1.21M
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
239
1.21M
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
240
1.21M
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
241
1.21M
    DCHECK_GE(in_bytes, BYTES_TO_READ);
242
243
    // Call UnpackValue for 0 <= i < 32.
244
1.21M
#pragma push_macro("UNPACK_VALUE_CALL")
245
1.21M
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
246
1.21M
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
247
248
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
249
1.21M
    return in + BYTES_TO_READ;
250
1.21M
#pragma pop_macro("UNPACK_VALUE_CALL")
251
1.21M
}
_ZN5doris10BitPacking14Unpack32ValuesIjLi18EEEPKhS3_lPT_
Line
Count
Source
236
381k
                                          OutType* __restrict__ out) {
237
381k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
238
381k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
239
381k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
240
381k
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
241
381k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
242
243
    // Call UnpackValue for 0 <= i < 32.
244
381k
#pragma push_macro("UNPACK_VALUE_CALL")
245
381k
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
246
381k
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
247
248
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
249
381k
    return in + BYTES_TO_READ;
250
381k
#pragma pop_macro("UNPACK_VALUE_CALL")
251
381k
}
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi19EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi20EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi21EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi22EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi23EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi24EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi25EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi26EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi27EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi28EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi29EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi30EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi31EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi32EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi33EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi34EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi35EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi36EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi37EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi38EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi39EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi40EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi41EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi42EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi43EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi44EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi45EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi46EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi47EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi48EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi49EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi50EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi51EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi52EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi53EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi54EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi55EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi56EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi57EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi58EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi59EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi60EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi61EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi62EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi63EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi64EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi0EEEPKhS3_lPT_
_ZN5doris10BitPacking14Unpack32ValuesIhLi1EEEPKhS3_lPT_
Line
Count
Source
236
15.9k
                                          OutType* __restrict__ out) {
237
15.9k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
238
15.9k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
239
15.9k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
240
15.9k
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
241
15.9k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
242
243
    // Call UnpackValue for 0 <= i < 32.
244
15.9k
#pragma push_macro("UNPACK_VALUE_CALL")
245
15.9k
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
246
15.9k
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
247
248
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
249
15.9k
    return in + BYTES_TO_READ;
250
15.9k
#pragma pop_macro("UNPACK_VALUE_CALL")
251
15.9k
}
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi2EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi3EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi4EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi5EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi6EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi7EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi8EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi9EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi10EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi11EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi12EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi13EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi14EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi15EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi16EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi17EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi18EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi19EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi20EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi21EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi22EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi23EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi24EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi25EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi26EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi27EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi28EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi29EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi30EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi31EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi32EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi33EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi34EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi35EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi36EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi37EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi38EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi39EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi40EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi41EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi42EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi43EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi44EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi45EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi46EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi47EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi48EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi49EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi50EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi51EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi52EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi53EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi54EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi55EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi56EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi57EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi58EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi59EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi60EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi61EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi62EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi63EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi64EEEPKhS3_lPT_
252
253
template <typename OutType>
254
const uint8_t* BitPacking::Unpack32Values(int bit_width, const uint8_t* __restrict__ in,
255
                                          int64_t in_bytes, OutType* __restrict__ out) {
256
#pragma push_macro("UNPACK_VALUES_CASE")
257
#define UNPACK_VALUES_CASE(ignore1, i, ignore2) \
258
    case i:                                     \
259
        return Unpack32Values<OutType, i>(in, in_bytes, out);
260
261
    switch (bit_width) {
262
        // Expand cases from 0 to 64.
263
        BOOST_PP_REPEAT_FROM_TO(0, 65, UNPACK_VALUES_CASE, ignore);
264
    default:
265
        DCHECK(false);
266
        return in;
267
    }
268
#pragma pop_macro("UNPACK_VALUES_CASE")
269
}
270
271
template <typename OutType, int BIT_WIDTH>
272
const uint8_t* BitPacking::UnpackAndDecode32Values(const uint8_t* __restrict__ in, int64_t in_bytes,
273
                                                   OutType* __restrict__ dict, int64_t dict_len,
274
                                                   OutType* __restrict__ out, int64_t stride,
275
                                                   bool* __restrict__ decode_error) {
276
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
277
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
278
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
279
    DCHECK_GE(in_bytes, BYTES_TO_READ);
280
    // TODO: this could be optimised further by using SIMD instructions.
281
    // https://lemire.me/blog/2016/08/25/faster-dictionary-decoding-with-simd-instructions/
282
283
    static_assert(BIT_WIDTH <= MAX_DICT_BITWIDTH, "Too high bit width for dictionary index.");
284
285
    // Call UnpackValue() and DecodeValue() for 0 <= i < 32.
286
#pragma push_macro("DECODE_VALUE_CALL")
287
#define DECODE_VALUE_CALL(ignore1, i, ignore2)                                               \
288
    {                                                                                        \
289
        uint32_t idx = UnpackValue<BIT_WIDTH, i, true>(in);                                  \
290
        uint8_t* out_pos = reinterpret_cast<uint8_t*>(out) + i * stride;                     \
291
        DecodeValue(dict, dict_len, idx, reinterpret_cast<OutType*>(out_pos), decode_error); \
292
    }
293
294
    BOOST_PP_REPEAT_FROM_TO(0, 32, DECODE_VALUE_CALL, ignore);
295
    return in + BYTES_TO_READ;
296
#pragma pop_macro("DECODE_VALUE_CALL")
297
}
298
299
template <typename OutType, int BIT_WIDTH>
300
const uint8_t* BitPacking::UnpackUpTo31Values(const uint8_t* __restrict__ in, int64_t in_bytes,
301
3.10M
                                              int num_values, OutType* __restrict__ out) {
302
3.10M
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
303
3.10M
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
304
3.10M
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
305
3.10M
    constexpr int MAX_BATCH_SIZE = 31;
306
3.10M
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
307
3.10M
    DCHECK_GE(in_bytes, BYTES_TO_READ);
308
3.10M
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
309
310
    // Make sure the buffer is at least 1 byte.
311
3.10M
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
312
3.10M
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
313
314
3.10M
    const uint8_t* in_buffer = in;
315
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
316
    // last 32-bit load would go past the end of the buffer.
317
3.10M
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
318
108k
        memcpy(tmp_buffer, in, BYTES_TO_READ);
319
108k
        in_buffer = tmp_buffer;
320
108k
    }
321
322
3.10M
#pragma push_macro("UNPACK_VALUES_CASE")
323
3.10M
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
324
57.4M
    case 31 - i:                                                                              \
325
57.4M
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
326
57.4M
        [[fallthrough]];
327
328
    // Use switch with fall-through cases to minimise branching.
329
3.10M
    switch (num_values) {
330
        // Expand cases from 31 down to 1.
331
57.4M
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
332
57.4M
    case 0:
333
3.10M
        break;
334
0
    default:
335
0
        DCHECK(false);
336
3.10M
    }
337
3.10M
    return in + BYTES_TO_READ;
338
3.10M
#pragma pop_macro("UNPACK_VALUES_CASE")
339
3.10M
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi0EEEPKhS3_liPT_
Line
Count
Source
301
2.36k
                                              int num_values, OutType* __restrict__ out) {
302
2.36k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
303
2.36k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
304
2.36k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
305
2.36k
    constexpr int MAX_BATCH_SIZE = 31;
306
2.36k
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
307
2.36k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
308
2.36k
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
309
310
    // Make sure the buffer is at least 1 byte.
311
2.36k
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
312
2.36k
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
313
314
2.36k
    const uint8_t* in_buffer = in;
315
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
316
    // last 32-bit load would go past the end of the buffer.
317
2.36k
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
318
0
        memcpy(tmp_buffer, in, BYTES_TO_READ);
319
0
        in_buffer = tmp_buffer;
320
0
    }
321
322
2.36k
#pragma push_macro("UNPACK_VALUES_CASE")
323
2.36k
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
324
2.36k
    case 31 - i:                                                                              \
325
2.36k
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
326
2.36k
        [[fallthrough]];
327
328
    // Use switch with fall-through cases to minimise branching.
329
2.36k
    switch (num_values) {
330
        // Expand cases from 31 down to 1.
331
18.8k
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
332
18.8k
    case 0:
333
2.36k
        break;
334
0
    default:
335
0
        DCHECK(false);
336
2.36k
    }
337
2.36k
    return in + BYTES_TO_READ;
338
2.36k
#pragma pop_macro("UNPACK_VALUES_CASE")
339
2.36k
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi1EEEPKhS3_liPT_
Line
Count
Source
301
964k
                                              int num_values, OutType* __restrict__ out) {
302
964k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
303
964k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
304
964k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
305
964k
    constexpr int MAX_BATCH_SIZE = 31;
306
964k
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
307
964k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
308
964k
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
309
310
    // Make sure the buffer is at least 1 byte.
311
964k
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
312
964k
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
313
314
964k
    const uint8_t* in_buffer = in;
315
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
316
    // last 32-bit load would go past the end of the buffer.
317
964k
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
318
12.0k
        memcpy(tmp_buffer, in, BYTES_TO_READ);
319
12.0k
        in_buffer = tmp_buffer;
320
12.0k
    }
321
322
964k
#pragma push_macro("UNPACK_VALUES_CASE")
323
964k
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
324
964k
    case 31 - i:                                                                              \
325
964k
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
326
964k
        [[fallthrough]];
327
328
    // Use switch with fall-through cases to minimise branching.
329
964k
    switch (num_values) {
330
        // Expand cases from 31 down to 1.
331
13.2M
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
332
13.2M
    case 0:
333
966k
        break;
334
0
    default:
335
0
        DCHECK(false);
336
964k
    }
337
963k
    return in + BYTES_TO_READ;
338
964k
#pragma pop_macro("UNPACK_VALUES_CASE")
339
964k
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi2EEEPKhS3_liPT_
Line
Count
Source
301
647k
                                              int num_values, OutType* __restrict__ out) {
302
647k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
303
647k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
304
647k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
305
647k
    constexpr int MAX_BATCH_SIZE = 31;
306
647k
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
307
647k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
308
647k
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
309
310
    // Make sure the buffer is at least 1 byte.
311
647k
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
312
647k
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
313
314
647k
    const uint8_t* in_buffer = in;
315
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
316
    // last 32-bit load would go past the end of the buffer.
317
647k
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
318
15.9k
        memcpy(tmp_buffer, in, BYTES_TO_READ);
319
15.9k
        in_buffer = tmp_buffer;
320
15.9k
    }
321
322
647k
#pragma push_macro("UNPACK_VALUES_CASE")
323
647k
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
324
647k
    case 31 - i:                                                                              \
325
647k
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
326
647k
        [[fallthrough]];
327
328
    // Use switch with fall-through cases to minimise branching.
329
647k
    switch (num_values) {
330
        // Expand cases from 31 down to 1.
331
10.0M
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
332
10.0M
    case 0:
333
649k
        break;
334
0
    default:
335
0
        DCHECK(false);
336
647k
    }
337
647k
    return in + BYTES_TO_READ;
338
647k
#pragma pop_macro("UNPACK_VALUES_CASE")
339
647k
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi3EEEPKhS3_liPT_
Line
Count
Source
301
113k
                                              int num_values, OutType* __restrict__ out) {
302
113k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
303
113k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
304
113k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
305
113k
    constexpr int MAX_BATCH_SIZE = 31;
306
113k
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
307
113k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
308
113k
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
309
310
    // Make sure the buffer is at least 1 byte.
311
113k
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
312
113k
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
313
314
113k
    const uint8_t* in_buffer = in;
315
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
316
    // last 32-bit load would go past the end of the buffer.
317
113k
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
318
9.13k
        memcpy(tmp_buffer, in, BYTES_TO_READ);
319
9.13k
        in_buffer = tmp_buffer;
320
9.13k
    }
321
322
113k
#pragma push_macro("UNPACK_VALUES_CASE")
323
113k
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
324
113k
    case 31 - i:                                                                              \
325
113k
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
326
113k
        [[fallthrough]];
327
328
    // Use switch with fall-through cases to minimise branching.
329
113k
    switch (num_values) {
330
        // Expand cases from 31 down to 1.
331
2.48M
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
332
2.48M
    case 0:
333
113k
        break;
334
0
    default:
335
0
        DCHECK(false);
336
113k
    }
337
113k
    return in + BYTES_TO_READ;
338
113k
#pragma pop_macro("UNPACK_VALUES_CASE")
339
113k
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi4EEEPKhS3_liPT_
Line
Count
Source
301
401k
                                              int num_values, OutType* __restrict__ out) {
302
401k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
303
401k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
304
401k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
305
401k
    constexpr int MAX_BATCH_SIZE = 31;
306
401k
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
307
401k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
308
401k
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
309
310
    // Make sure the buffer is at least 1 byte.
311
401k
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
312
401k
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
313
314
401k
    const uint8_t* in_buffer = in;
315
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
316
    // last 32-bit load would go past the end of the buffer.
317
401k
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
318
0
        memcpy(tmp_buffer, in, BYTES_TO_READ);
319
0
        in_buffer = tmp_buffer;
320
0
    }
321
322
401k
#pragma push_macro("UNPACK_VALUES_CASE")
323
401k
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
324
401k
    case 31 - i:                                                                              \
325
401k
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
326
401k
        [[fallthrough]];
327
328
    // Use switch with fall-through cases to minimise branching.
329
401k
    switch (num_values) {
330
        // Expand cases from 31 down to 1.
331
9.38M
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
332
9.38M
    case 0:
333
401k
        break;
334
0
    default:
335
0
        DCHECK(false);
336
401k
    }
337
401k
    return in + BYTES_TO_READ;
338
401k
#pragma pop_macro("UNPACK_VALUES_CASE")
339
401k
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi5EEEPKhS3_liPT_
Line
Count
Source
301
1.76k
                                              int num_values, OutType* __restrict__ out) {
302
1.76k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
303
1.76k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
304
1.76k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
305
1.76k
    constexpr int MAX_BATCH_SIZE = 31;
306
1.76k
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
307
1.76k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
308
1.76k
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
309
310
    // Make sure the buffer is at least 1 byte.
311
1.76k
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
312
1.76k
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
313
314
1.76k
    const uint8_t* in_buffer = in;
315
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
316
    // last 32-bit load would go past the end of the buffer.
317
1.76k
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
318
1.09k
        memcpy(tmp_buffer, in, BYTES_TO_READ);
319
1.09k
        in_buffer = tmp_buffer;
320
1.09k
    }
321
322
1.76k
#pragma push_macro("UNPACK_VALUES_CASE")
323
1.76k
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
324
1.76k
    case 31 - i:                                                                              \
325
1.76k
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
326
1.76k
        [[fallthrough]];
327
328
    // Use switch with fall-through cases to minimise branching.
329
1.76k
    switch (num_values) {
330
        // Expand cases from 31 down to 1.
331
26.6k
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
332
26.6k
    case 0:
333
1.76k
        break;
334
0
    default:
335
0
        DCHECK(false);
336
1.76k
    }
337
1.76k
    return in + BYTES_TO_READ;
338
1.76k
#pragma pop_macro("UNPACK_VALUES_CASE")
339
1.76k
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi6EEEPKhS3_liPT_
Line
Count
Source
301
222k
                                              int num_values, OutType* __restrict__ out) {
302
222k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
303
222k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
304
222k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
305
222k
    constexpr int MAX_BATCH_SIZE = 31;
306
222k
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
307
222k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
308
222k
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
309
310
    // Make sure the buffer is at least 1 byte.
311
222k
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
312
222k
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
313
314
222k
    const uint8_t* in_buffer = in;
315
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
316
    // last 32-bit load would go past the end of the buffer.
317
222k
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
318
6.94k
        memcpy(tmp_buffer, in, BYTES_TO_READ);
319
6.94k
        in_buffer = tmp_buffer;
320
6.94k
    }
321
322
222k
#pragma push_macro("UNPACK_VALUES_CASE")
323
222k
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
324
222k
    case 31 - i:                                                                              \
325
222k
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
326
222k
        [[fallthrough]];
327
328
    // Use switch with fall-through cases to minimise branching.
329
222k
    switch (num_values) {
330
        // Expand cases from 31 down to 1.
331
5.31M
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
332
5.31M
    case 0:
333
222k
        break;
334
0
    default:
335
0
        DCHECK(false);
336
222k
    }
337
222k
    return in + BYTES_TO_READ;
338
222k
#pragma pop_macro("UNPACK_VALUES_CASE")
339
222k
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi7EEEPKhS3_liPT_
Line
Count
Source
301
7.25k
                                              int num_values, OutType* __restrict__ out) {
302
7.25k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
303
7.25k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
304
7.25k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
305
7.25k
    constexpr int MAX_BATCH_SIZE = 31;
306
7.25k
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
307
7.25k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
308
7.25k
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
309
310
    // Make sure the buffer is at least 1 byte.
311
7.25k
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
312
7.25k
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
313
314
7.25k
    const uint8_t* in_buffer = in;
315
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
316
    // last 32-bit load would go past the end of the buffer.
317
7.25k
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
318
5.62k
        memcpy(tmp_buffer, in, BYTES_TO_READ);
319
5.62k
        in_buffer = tmp_buffer;
320
5.62k
    }
321
322
7.25k
#pragma push_macro("UNPACK_VALUES_CASE")
323
7.25k
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
324
7.25k
    case 31 - i:                                                                              \
325
7.25k
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
326
7.25k
        [[fallthrough]];
327
328
    // Use switch with fall-through cases to minimise branching.
329
7.25k
    switch (num_values) {
330
        // Expand cases from 31 down to 1.
331
89.3k
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
332
89.3k
    case 0:
333
7.25k
        break;
334
0
    default:
335
0
        DCHECK(false);
336
7.25k
    }
337
7.25k
    return in + BYTES_TO_READ;
338
7.25k
#pragma pop_macro("UNPACK_VALUES_CASE")
339
7.25k
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi8EEEPKhS3_liPT_
Line
Count
Source
301
7.80k
                                              int num_values, OutType* __restrict__ out) {
302
7.80k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
303
7.80k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
304
7.80k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
305
7.80k
    constexpr int MAX_BATCH_SIZE = 31;
306
7.80k
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
307
7.80k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
308
7.80k
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
309
310
    // Make sure the buffer is at least 1 byte.
311
7.80k
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
312
7.80k
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
313
314
7.80k
    const uint8_t* in_buffer = in;
315
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
316
    // last 32-bit load would go past the end of the buffer.
317
7.80k
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
318
0
        memcpy(tmp_buffer, in, BYTES_TO_READ);
319
0
        in_buffer = tmp_buffer;
320
0
    }
321
322
7.80k
#pragma push_macro("UNPACK_VALUES_CASE")
323
7.80k
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
324
7.80k
    case 31 - i:                                                                              \
325
7.80k
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
326
7.80k
        [[fallthrough]];
327
328
    // Use switch with fall-through cases to minimise branching.
329
7.80k
    switch (num_values) {
330
        // Expand cases from 31 down to 1.
331
92.2k
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
332
92.2k
    case 0:
333
7.80k
        break;
334
0
    default:
335
0
        DCHECK(false);
336
7.80k
    }
337
7.80k
    return in + BYTES_TO_READ;
338
7.80k
#pragma pop_macro("UNPACK_VALUES_CASE")
339
7.80k
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi9EEEPKhS3_liPT_
Line
Count
Source
301
13.1k
                                              int num_values, OutType* __restrict__ out) {
302
13.1k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
303
13.1k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
304
13.1k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
305
13.1k
    constexpr int MAX_BATCH_SIZE = 31;
306
13.1k
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
307
13.1k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
308
13.1k
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
309
310
    // Make sure the buffer is at least 1 byte.
311
13.1k
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
312
13.1k
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
313
314
13.1k
    const uint8_t* in_buffer = in;
315
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
316
    // last 32-bit load would go past the end of the buffer.
317
13.1k
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
318
9.68k
        memcpy(tmp_buffer, in, BYTES_TO_READ);
319
9.68k
        in_buffer = tmp_buffer;
320
9.68k
    }
321
322
13.1k
#pragma push_macro("UNPACK_VALUES_CASE")
323
13.1k
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
324
13.1k
    case 31 - i:                                                                              \
325
13.1k
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
326
13.1k
        [[fallthrough]];
327
328
    // Use switch with fall-through cases to minimise branching.
329
13.1k
    switch (num_values) {
330
        // Expand cases from 31 down to 1.
331
159k
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
332
159k
    case 0:
333
13.1k
        break;
334
0
    default:
335
0
        DCHECK(false);
336
13.1k
    }
337
13.1k
    return in + BYTES_TO_READ;
338
13.1k
#pragma pop_macro("UNPACK_VALUES_CASE")
339
13.1k
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi10EEEPKhS3_liPT_
Line
Count
Source
301
11.5k
                                              int num_values, OutType* __restrict__ out) {
302
11.5k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
303
11.5k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
304
11.5k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
305
11.5k
    constexpr int MAX_BATCH_SIZE = 31;
306
11.5k
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
307
11.5k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
308
11.5k
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
309
310
    // Make sure the buffer is at least 1 byte.
311
11.5k
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
312
11.5k
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
313
314
11.5k
    const uint8_t* in_buffer = in;
315
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
316
    // last 32-bit load would go past the end of the buffer.
317
11.5k
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
318
8.72k
        memcpy(tmp_buffer, in, BYTES_TO_READ);
319
8.72k
        in_buffer = tmp_buffer;
320
8.72k
    }
321
322
11.5k
#pragma push_macro("UNPACK_VALUES_CASE")
323
11.5k
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
324
11.5k
    case 31 - i:                                                                              \
325
11.5k
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
326
11.5k
        [[fallthrough]];
327
328
    // Use switch with fall-through cases to minimise branching.
329
11.5k
    switch (num_values) {
330
        // Expand cases from 31 down to 1.
331
141k
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
332
141k
    case 0:
333
11.5k
        break;
334
0
    default:
335
0
        DCHECK(false);
336
11.5k
    }
337
11.5k
    return in + BYTES_TO_READ;
338
11.5k
#pragma pop_macro("UNPACK_VALUES_CASE")
339
11.5k
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi11EEEPKhS3_liPT_
Line
Count
Source
301
40.0k
                                              int num_values, OutType* __restrict__ out) {
302
40.0k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
303
40.0k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
304
40.0k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
305
40.0k
    constexpr int MAX_BATCH_SIZE = 31;
306
40.0k
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
307
40.0k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
308
40.0k
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
309
310
    // Make sure the buffer is at least 1 byte.
311
40.0k
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
312
40.0k
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
313
314
40.0k
    const uint8_t* in_buffer = in;
315
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
316
    // last 32-bit load would go past the end of the buffer.
317
40.0k
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
318
4.22k
        memcpy(tmp_buffer, in, BYTES_TO_READ);
319
4.22k
        in_buffer = tmp_buffer;
320
4.22k
    }
321
322
40.0k
#pragma push_macro("UNPACK_VALUES_CASE")
323
40.0k
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
324
40.0k
    case 31 - i:                                                                              \
325
40.0k
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
326
40.0k
        [[fallthrough]];
327
328
    // Use switch with fall-through cases to minimise branching.
329
40.0k
    switch (num_values) {
330
        // Expand cases from 31 down to 1.
331
890k
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
332
890k
    case 0:
333
40.0k
        break;
334
0
    default:
335
0
        DCHECK(false);
336
40.0k
    }
337
40.0k
    return in + BYTES_TO_READ;
338
40.0k
#pragma pop_macro("UNPACK_VALUES_CASE")
339
40.0k
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi12EEEPKhS3_liPT_
Line
Count
Source
301
266k
                                              int num_values, OutType* __restrict__ out) {
302
266k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
303
266k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
304
266k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
305
266k
    constexpr int MAX_BATCH_SIZE = 31;
306
266k
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
307
266k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
308
266k
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
309
310
    // Make sure the buffer is at least 1 byte.
311
266k
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
312
266k
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
313
314
266k
    const uint8_t* in_buffer = in;
315
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
316
    // last 32-bit load would go past the end of the buffer.
317
266k
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
318
0
        memcpy(tmp_buffer, in, BYTES_TO_READ);
319
0
        in_buffer = tmp_buffer;
320
0
    }
321
322
266k
#pragma push_macro("UNPACK_VALUES_CASE")
323
266k
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
324
266k
    case 31 - i:                                                                              \
325
266k
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
326
266k
        [[fallthrough]];
327
328
    // Use switch with fall-through cases to minimise branching.
329
266k
    switch (num_values) {
330
        // Expand cases from 31 down to 1.
331
6.24M
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
332
6.24M
    case 0:
333
266k
        break;
334
0
    default:
335
0
        DCHECK(false);
336
266k
    }
337
266k
    return in + BYTES_TO_READ;
338
266k
#pragma pop_macro("UNPACK_VALUES_CASE")
339
266k
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi13EEEPKhS3_liPT_
Line
Count
Source
301
42.2k
                                              int num_values, OutType* __restrict__ out) {
302
42.2k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
303
42.2k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
304
42.2k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
305
42.2k
    constexpr int MAX_BATCH_SIZE = 31;
306
42.2k
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
307
42.2k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
308
42.2k
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
309
310
    // Make sure the buffer is at least 1 byte.
311
42.2k
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
312
42.2k
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
313
314
42.2k
    const uint8_t* in_buffer = in;
315
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
316
    // last 32-bit load would go past the end of the buffer.
317
42.2k
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
318
14.3k
        memcpy(tmp_buffer, in, BYTES_TO_READ);
319
14.3k
        in_buffer = tmp_buffer;
320
14.3k
    }
321
322
42.2k
#pragma push_macro("UNPACK_VALUES_CASE")
323
42.2k
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
324
42.2k
    case 31 - i:                                                                              \
325
42.2k
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
326
42.2k
        [[fallthrough]];
327
328
    // Use switch with fall-through cases to minimise branching.
329
42.2k
    switch (num_values) {
330
        // Expand cases from 31 down to 1.
331
789k
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
332
789k
    case 0:
333
42.2k
        break;
334
0
    default:
335
0
        DCHECK(false);
336
42.2k
    }
337
42.2k
    return in + BYTES_TO_READ;
338
42.2k
#pragma pop_macro("UNPACK_VALUES_CASE")
339
42.2k
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi14EEEPKhS3_liPT_
Line
Count
Source
301
162k
                                              int num_values, OutType* __restrict__ out) {
302
162k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
303
162k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
304
162k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
305
162k
    constexpr int MAX_BATCH_SIZE = 31;
306
162k
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
307
162k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
308
162k
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
309
310
    // Make sure the buffer is at least 1 byte.
311
162k
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
312
162k
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
313
314
162k
    const uint8_t* in_buffer = in;
315
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
316
    // last 32-bit load would go past the end of the buffer.
317
162k
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
318
10.2k
        memcpy(tmp_buffer, in, BYTES_TO_READ);
319
10.2k
        in_buffer = tmp_buffer;
320
10.2k
    }
321
322
162k
#pragma push_macro("UNPACK_VALUES_CASE")
323
162k
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
324
162k
    case 31 - i:                                                                              \
325
162k
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
326
162k
        [[fallthrough]];
327
328
    // Use switch with fall-through cases to minimise branching.
329
162k
    switch (num_values) {
330
        // Expand cases from 31 down to 1.
331
3.84M
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
332
3.84M
    case 0:
333
162k
        break;
334
0
    default:
335
0
        DCHECK(false);
336
162k
    }
337
162k
    return in + BYTES_TO_READ;
338
162k
#pragma pop_macro("UNPACK_VALUES_CASE")
339
162k
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi15EEEPKhS3_liPT_
Line
Count
Source
301
37.6k
                                              int num_values, OutType* __restrict__ out) {
302
37.6k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
303
37.6k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
304
37.6k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
305
37.6k
    constexpr int MAX_BATCH_SIZE = 31;
306
37.6k
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
307
37.6k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
308
37.6k
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
309
310
    // Make sure the buffer is at least 1 byte.
311
37.6k
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
312
37.6k
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
313
314
37.6k
    const uint8_t* in_buffer = in;
315
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
316
    // last 32-bit load would go past the end of the buffer.
317
37.6k
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
318
3.48k
        memcpy(tmp_buffer, in, BYTES_TO_READ);
319
3.48k
        in_buffer = tmp_buffer;
320
3.48k
    }
321
322
37.6k
#pragma push_macro("UNPACK_VALUES_CASE")
323
37.6k
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
324
37.6k
    case 31 - i:                                                                              \
325
37.6k
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
326
37.6k
        [[fallthrough]];
327
328
    // Use switch with fall-through cases to minimise branching.
329
37.6k
    switch (num_values) {
330
        // Expand cases from 31 down to 1.
331
802k
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
332
802k
    case 0:
333
37.6k
        break;
334
0
    default:
335
0
        DCHECK(false);
336
37.6k
    }
337
37.5k
    return in + BYTES_TO_READ;
338
37.6k
#pragma pop_macro("UNPACK_VALUES_CASE")
339
37.6k
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi16EEEPKhS3_liPT_
Line
Count
Source
301
49.9k
                                              int num_values, OutType* __restrict__ out) {
302
49.9k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
303
49.9k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
304
49.9k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
305
49.9k
    constexpr int MAX_BATCH_SIZE = 31;
306
49.9k
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
307
49.9k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
308
49.9k
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
309
310
    // Make sure the buffer is at least 1 byte.
311
49.9k
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
312
49.9k
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
313
314
49.9k
    const uint8_t* in_buffer = in;
315
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
316
    // last 32-bit load would go past the end of the buffer.
317
49.9k
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
318
0
        memcpy(tmp_buffer, in, BYTES_TO_READ);
319
0
        in_buffer = tmp_buffer;
320
0
    }
321
322
49.9k
#pragma push_macro("UNPACK_VALUES_CASE")
323
49.9k
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
324
49.9k
    case 31 - i:                                                                              \
325
49.9k
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
326
49.9k
        [[fallthrough]];
327
328
    // Use switch with fall-through cases to minimise branching.
329
49.9k
    switch (num_values) {
330
        // Expand cases from 31 down to 1.
331
1.15M
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
332
1.15M
    case 0:
333
49.9k
        break;
334
0
    default:
335
0
        DCHECK(false);
336
49.9k
    }
337
50.0k
    return in + BYTES_TO_READ;
338
49.9k
#pragma pop_macro("UNPACK_VALUES_CASE")
339
49.9k
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi17EEEPKhS3_liPT_
Line
Count
Source
301
82.8k
                                              int num_values, OutType* __restrict__ out) {
302
82.8k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
303
82.8k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
304
82.8k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
305
82.8k
    constexpr int MAX_BATCH_SIZE = 31;
306
82.8k
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
307
82.8k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
308
82.8k
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
309
310
    // Make sure the buffer is at least 1 byte.
311
82.8k
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
312
82.8k
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
313
314
82.8k
    const uint8_t* in_buffer = in;
315
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
316
    // last 32-bit load would go past the end of the buffer.
317
82.8k
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
318
2.08k
        memcpy(tmp_buffer, in, BYTES_TO_READ);
319
2.08k
        in_buffer = tmp_buffer;
320
2.08k
    }
321
322
82.8k
#pragma push_macro("UNPACK_VALUES_CASE")
323
82.8k
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
324
82.8k
    case 31 - i:                                                                              \
325
82.8k
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
326
82.8k
        [[fallthrough]];
327
328
    // Use switch with fall-through cases to minimise branching.
329
82.8k
    switch (num_values) {
330
        // Expand cases from 31 down to 1.
331
1.99M
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
332
1.99M
    case 0:
333
82.9k
        break;
334
0
    default:
335
0
        DCHECK(false);
336
82.8k
    }
337
82.9k
    return in + BYTES_TO_READ;
338
82.8k
#pragma pop_macro("UNPACK_VALUES_CASE")
339
82.8k
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi18EEEPKhS3_liPT_
Line
Count
Source
301
25.7k
                                              int num_values, OutType* __restrict__ out) {
302
25.7k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
303
25.7k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
304
25.7k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
305
25.7k
    constexpr int MAX_BATCH_SIZE = 31;
306
25.7k
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
307
25.7k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
308
25.7k
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
309
310
    // Make sure the buffer is at least 1 byte.
311
25.7k
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
312
25.7k
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
313
314
25.7k
    const uint8_t* in_buffer = in;
315
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
316
    // last 32-bit load would go past the end of the buffer.
317
25.7k
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
318
658
        memcpy(tmp_buffer, in, BYTES_TO_READ);
319
658
        in_buffer = tmp_buffer;
320
658
    }
321
322
25.7k
#pragma push_macro("UNPACK_VALUES_CASE")
323
25.7k
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
324
25.7k
    case 31 - i:                                                                              \
325
25.7k
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
326
25.7k
        [[fallthrough]];
327
328
    // Use switch with fall-through cases to minimise branching.
329
25.7k
    switch (num_values) {
330
        // Expand cases from 31 down to 1.
331
616k
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
332
616k
    case 0:
333
25.7k
        break;
334
0
    default:
335
0
        DCHECK(false);
336
25.7k
    }
337
25.7k
    return in + BYTES_TO_READ;
338
25.7k
#pragma pop_macro("UNPACK_VALUES_CASE")
339
25.7k
}
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi19EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi20EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi21EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi22EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi23EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi24EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi25EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi26EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi27EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi28EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi29EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi30EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi31EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi32EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi33EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi34EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi35EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi36EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi37EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi38EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi39EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi40EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi41EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi42EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi43EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi44EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi45EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi46EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi47EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi48EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi49EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi50EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi51EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi52EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi53EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi54EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi55EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi56EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi57EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi58EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi59EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi60EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi61EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi62EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi63EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi64EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi0EEEPKhS3_liPT_
_ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi1EEEPKhS3_liPT_
Line
Count
Source
301
4.46k
                                              int num_values, OutType* __restrict__ out) {
302
4.46k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
303
4.46k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
304
4.46k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
305
4.46k
    constexpr int MAX_BATCH_SIZE = 31;
306
4.46k
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
307
4.46k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
308
4.46k
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
309
310
    // Make sure the buffer is at least 1 byte.
311
4.46k
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
312
4.46k
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
313
314
4.46k
    const uint8_t* in_buffer = in;
315
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
316
    // last 32-bit load would go past the end of the buffer.
317
4.46k
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
318
4.46k
        memcpy(tmp_buffer, in, BYTES_TO_READ);
319
4.46k
        in_buffer = tmp_buffer;
320
4.46k
    }
321
322
4.46k
#pragma push_macro("UNPACK_VALUES_CASE")
323
4.46k
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
324
4.46k
    case 31 - i:                                                                              \
325
4.46k
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
326
4.46k
        [[fallthrough]];
327
328
    // Use switch with fall-through cases to minimise branching.
329
4.46k
    switch (num_values) {
330
        // Expand cases from 31 down to 1.
331
38.5k
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
332
38.5k
    case 0:
333
4.46k
        break;
334
0
    default:
335
0
        DCHECK(false);
336
4.46k
    }
337
4.46k
    return in + BYTES_TO_READ;
338
4.46k
#pragma pop_macro("UNPACK_VALUES_CASE")
339
4.46k
}
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi2EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi3EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi4EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi5EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi6EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi7EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi8EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi9EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi10EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi11EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi12EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi13EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi14EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi15EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi16EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi17EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi18EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi19EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi20EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi21EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi22EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi23EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi24EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi25EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi26EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi27EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi28EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi29EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi30EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi31EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi32EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi33EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi34EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi35EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi36EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi37EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi38EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi39EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi40EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi41EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi42EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi43EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi44EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi45EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi46EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi47EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi48EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi49EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi50EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi51EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi52EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi53EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi54EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi55EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi56EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi57EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi58EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi59EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi60EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi61EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi62EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi63EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi64EEEPKhS3_liPT_
340
341
template <typename OutType, int BIT_WIDTH>
342
const uint8_t* BitPacking::UnpackAndDecodeUpTo31Values(const uint8_t* __restrict__ in,
343
                                                       int64_t in_bytes, OutType* __restrict__ dict,
344
                                                       int64_t dict_len, int num_values,
345
                                                       OutType* __restrict__ out, int64_t stride,
346
                                                       bool* __restrict__ decode_error) {
347
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
348
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
349
    constexpr int MAX_BATCH_SIZE = 31;
350
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
351
    DCHECK_GE(in_bytes, BYTES_TO_READ);
352
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
353
354
    // Make sure the buffer is at least 1 byte.
355
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
356
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
357
358
    const uint8_t* in_buffer = in;
359
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
360
    // last 32-bit load would go past the end of the buffer.
361
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
362
        memcpy(tmp_buffer, in, BYTES_TO_READ);
363
        in_buffer = tmp_buffer;
364
    }
365
366
#pragma push_macro("DECODE_VALUES_CASE")
367
#define DECODE_VALUES_CASE(ignore1, i, ignore2)                                              \
368
    case 31 - i: {                                                                           \
369
        uint32_t idx = UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer);                     \
370
        uint8_t* out_pos = reinterpret_cast<uint8_t*>(out) + (30 - i) * stride;              \
371
        DecodeValue(dict, dict_len, idx, reinterpret_cast<OutType*>(out_pos), decode_error); \
372
    }
373
374
    // Use switch with fall-through cases to minimise branching.
375
    switch (num_values) {
376
        // Expand cases from 31 down to 1.
377
        BOOST_PP_REPEAT_FROM_TO(0, 31, DECODE_VALUES_CASE, ignore);
378
    case 0:
379
        break;
380
    default:
381
        DCHECK(false);
382
    }
383
    return in + BYTES_TO_READ;
384
#pragma pop_macro("DECODE_VALUES_CASE")
385
}
386
} // namespace doris