Coverage Report

Created: 2026-04-14 17:06

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/util/frame_of_reference_coding.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "util/frame_of_reference_coding.h"
19
20
#include <glog/logging.h>
21
#include <sys/types.h>
22
23
#include <algorithm>
24
#include <cstring>
25
#include <iostream>
26
#include <iterator>
27
#include <limits>
28
29
#include "common/cast_set.h"
30
#include "exec/common/endian.h"
31
#include "util/bit_util.h"
32
#include "util/coding.h"
33
34
namespace doris {
35
36
template <typename T>
37
4.17M
const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) {
38
4.17M
    memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T));
39
4.17M
    _buffered_values_num += count;
40
4.17M
    p_data += count;
41
4.17M
    return p_data;
42
4.17M
}
Unexecuted instantiation: _ZN5doris10ForEncoderIaE10copy_valueEPKam
Unexecuted instantiation: _ZN5doris10ForEncoderIsE10copy_valueEPKsm
_ZN5doris10ForEncoderIiE10copy_valueEPKim
Line
Count
Source
37
8
const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) {
38
8
    memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T));
39
8
    _buffered_values_num += count;
40
8
    p_data += count;
41
8
    return p_data;
42
8
}
_ZN5doris10ForEncoderIlE10copy_valueEPKlm
Line
Count
Source
37
2.08M
const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) {
38
2.08M
    memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T));
39
2.08M
    _buffered_values_num += count;
40
2.08M
    p_data += count;
41
2.08M
    return p_data;
42
2.08M
}
_ZN5doris10ForEncoderInE10copy_valueEPKnm
Line
Count
Source
37
2.08M
const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) {
38
2.08M
    memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T));
39
2.08M
    _buffered_values_num += count;
40
2.08M
    p_data += count;
41
2.08M
    return p_data;
42
2.08M
}
Unexecuted instantiation: _ZN5doris10ForEncoderIhE10copy_valueEPKhm
Unexecuted instantiation: _ZN5doris10ForEncoderItE10copy_valueEPKtm
_ZN5doris10ForEncoderIjE10copy_valueEPKjm
Line
Count
Source
37
3
const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) {
38
3
    memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T));
39
3
    _buffered_values_num += count;
40
3
    p_data += count;
41
3
    return p_data;
42
3
}
Unexecuted instantiation: _ZN5doris10ForEncoderImE10copy_valueEPKmm
Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10copy_valueEPKS1_m
Unexecuted instantiation: _ZN5doris10ForEncoderIoE10copy_valueEPKom
43
44
template <typename T>
45
4.17M
void ForEncoder<T>::put_batch(const T* in_data, size_t count) {
46
4.17M
    if (_buffered_values_num + count < FRAME_VALUE_NUM) {
47
4.16M
        copy_value(in_data, count);
48
4.16M
        _values_num += count;
49
4.16M
        return;
50
4.16M
    }
51
52
    // 1. padding one frame
53
16.3k
    size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num;
54
16.3k
    in_data = copy_value(in_data, padding_num);
55
16.3k
    bit_packing_one_frame_value(_buffered_values);
56
57
    // 2. process frame by frame
58
16.3k
    size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM;
59
16.4k
    for (size_t i = 0; i < frame_size; i++) {
60
        // directly encode value to the bit_writer, don't buffer the value
61
8
        _buffered_values_num = FRAME_VALUE_NUM;
62
8
        bit_packing_one_frame_value(in_data);
63
8
        in_data += FRAME_VALUE_NUM;
64
8
    }
65
66
    // 3. process remaining value
67
16.3k
    size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM;
68
16.3k
    if (remaining_num > 0) {
69
4
        copy_value(in_data, remaining_num);
70
4
    }
71
72
16.3k
    _values_num += count;
73
16.3k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIaE9put_batchEPKam
Unexecuted instantiation: _ZN5doris10ForEncoderIsE9put_batchEPKsm
_ZN5doris10ForEncoderIiE9put_batchEPKim
Line
Count
Source
45
7
void ForEncoder<T>::put_batch(const T* in_data, size_t count) {
46
7
    if (_buffered_values_num + count < FRAME_VALUE_NUM) {
47
4
        copy_value(in_data, count);
48
4
        _values_num += count;
49
4
        return;
50
4
    }
51
52
    // 1. padding one frame
53
3
    size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num;
54
3
    in_data = copy_value(in_data, padding_num);
55
3
    bit_packing_one_frame_value(_buffered_values);
56
57
    // 2. process frame by frame
58
3
    size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM;
59
5
    for (size_t i = 0; i < frame_size; i++) {
60
        // directly encode value to the bit_writer, don't buffer the value
61
2
        _buffered_values_num = FRAME_VALUE_NUM;
62
2
        bit_packing_one_frame_value(in_data);
63
2
        in_data += FRAME_VALUE_NUM;
64
2
    }
65
66
    // 3. process remaining value
67
3
    size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM;
68
3
    if (remaining_num > 0) {
69
1
        copy_value(in_data, remaining_num);
70
1
    }
71
72
3
    _values_num += count;
73
3
}
_ZN5doris10ForEncoderIlE9put_batchEPKlm
Line
Count
Source
45
2.08M
void ForEncoder<T>::put_batch(const T* in_data, size_t count) {
46
2.08M
    if (_buffered_values_num + count < FRAME_VALUE_NUM) {
47
2.08M
        copy_value(in_data, count);
48
2.08M
        _values_num += count;
49
2.08M
        return;
50
2.08M
    }
51
52
    // 1. padding one frame
53
8.19k
    size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num;
54
8.19k
    in_data = copy_value(in_data, padding_num);
55
8.19k
    bit_packing_one_frame_value(_buffered_values);
56
57
    // 2. process frame by frame
58
8.19k
    size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM;
59
8.19k
    for (size_t i = 0; i < frame_size; i++) {
60
        // directly encode value to the bit_writer, don't buffer the value
61
3
        _buffered_values_num = FRAME_VALUE_NUM;
62
3
        bit_packing_one_frame_value(in_data);
63
3
        in_data += FRAME_VALUE_NUM;
64
3
    }
65
66
    // 3. process remaining value
67
8.19k
    size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM;
68
8.19k
    if (remaining_num > 0) {
69
3
        copy_value(in_data, remaining_num);
70
3
    }
71
72
8.19k
    _values_num += count;
73
8.19k
}
_ZN5doris10ForEncoderInE9put_batchEPKnm
Line
Count
Source
45
2.08M
void ForEncoder<T>::put_batch(const T* in_data, size_t count) {
46
2.08M
    if (_buffered_values_num + count < FRAME_VALUE_NUM) {
47
2.08M
        copy_value(in_data, count);
48
2.08M
        _values_num += count;
49
2.08M
        return;
50
2.08M
    }
51
52
    // 1. padding one frame
53
8.19k
    size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num;
54
8.19k
    in_data = copy_value(in_data, padding_num);
55
8.19k
    bit_packing_one_frame_value(_buffered_values);
56
57
    // 2. process frame by frame
58
8.19k
    size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM;
59
8.19k
    for (size_t i = 0; i < frame_size; i++) {
60
        // directly encode value to the bit_writer, don't buffer the value
61
0
        _buffered_values_num = FRAME_VALUE_NUM;
62
0
        bit_packing_one_frame_value(in_data);
63
0
        in_data += FRAME_VALUE_NUM;
64
0
    }
65
66
    // 3. process remaining value
67
8.19k
    size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM;
68
8.19k
    if (remaining_num > 0) {
69
0
        copy_value(in_data, remaining_num);
70
0
    }
71
72
8.19k
    _values_num += count;
73
8.19k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIhE9put_batchEPKhm
Unexecuted instantiation: _ZN5doris10ForEncoderItE9put_batchEPKtm
_ZN5doris10ForEncoderIjE9put_batchEPKjm
Line
Count
Source
45
3
void ForEncoder<T>::put_batch(const T* in_data, size_t count) {
46
3
    if (_buffered_values_num + count < FRAME_VALUE_NUM) {
47
0
        copy_value(in_data, count);
48
0
        _values_num += count;
49
0
        return;
50
0
    }
51
52
    // 1. padding one frame
53
3
    size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num;
54
3
    in_data = copy_value(in_data, padding_num);
55
3
    bit_packing_one_frame_value(_buffered_values);
56
57
    // 2. process frame by frame
58
3
    size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM;
59
6
    for (size_t i = 0; i < frame_size; i++) {
60
        // directly encode value to the bit_writer, don't buffer the value
61
3
        _buffered_values_num = FRAME_VALUE_NUM;
62
3
        bit_packing_one_frame_value(in_data);
63
3
        in_data += FRAME_VALUE_NUM;
64
3
    }
65
66
    // 3. process remaining value
67
3
    size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM;
68
3
    if (remaining_num > 0) {
69
0
        copy_value(in_data, remaining_num);
70
0
    }
71
72
3
    _values_num += count;
73
3
}
Unexecuted instantiation: _ZN5doris10ForEncoderImE9put_batchEPKmm
Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE9put_batchEPKS1_m
Unexecuted instantiation: _ZN5doris10ForEncoderIoE9put_batchEPKom
74
75
// todo(kks): improve this method by SIMD instructions
76
77
template <typename T>
78
15.3k
void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
79
15.3k
    int64_t s = 0;
80
15.3k
    uint8_t output_mask = 255;
81
15.3k
    int tail_count = in_num & 7;              // the remainder of in_num modulo 8
82
15.3k
    int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8
83
84
237k
    for (int i = 0; i < full_batch_size; i += 8) {
85
        // Put the 8 numbers in the input into s in order, each number occupies bit_width bit
86
222k
        s |= static_cast<int64_t>(input[i + 7]);
87
222k
        s |= (static_cast<int64_t>(input[i + 6])) << bit_width;
88
222k
        s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width);
89
222k
        s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width);
90
222k
        s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width);
91
222k
        s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width);
92
222k
        s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width);
93
222k
        s |= (static_cast<int64_t>(input[i])) << (7 * bit_width);
94
95
        // Starting with the highest valid bit, take out 8 bits in sequence
96
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
97
        // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end
98
1.22M
        for (int j = 0; j < bit_width; j++) {
99
1.00M
            output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask;
100
1.00M
        }
101
222k
        output += bit_width;
102
222k
        s = 0;
103
222k
    }
104
105
    // remainder
106
15.3k
    int byte = tail_count * bit_width; // How many bits are left to store
107
15.3k
    int bytes = (byte + 7) >> 3;       // How many more bytes are needed to store the rest of input
108
109
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
110
65.4k
    for (int i = 0; i < tail_count; i++) {
111
50.1k
        s |= (static_cast<int64_t>(input[i + full_batch_size]))
112
50.1k
             << ((tail_count - i - 1) * bit_width);
113
50.1k
    }
114
115
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
116
15.3k
    s <<= (bytes << 3) - byte;
117
118
    // Starting with the highest valid bit, take out 8 bits in sequence
119
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
120
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
121
48.4k
    for (int i = 0; i < bytes; i++) {
122
33.1k
        output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask;
123
33.1k
    }
124
15.3k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIaE10bit_pack_8EPKahiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIsE10bit_pack_8EPKshiPh
_ZN5doris10ForEncoderIiE10bit_pack_8EPKihiPh
Line
Count
Source
78
8
void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
79
8
    int64_t s = 0;
80
8
    uint8_t output_mask = 255;
81
8
    int tail_count = in_num & 7;              // the remainder of in_num modulo 8
82
8
    int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8
83
84
104
    for (int i = 0; i < full_batch_size; i += 8) {
85
        // Put the 8 numbers in the input into s in order, each number occupies bit_width bit
86
96
        s |= static_cast<int64_t>(input[i + 7]);
87
96
        s |= (static_cast<int64_t>(input[i + 6])) << bit_width;
88
96
        s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width);
89
96
        s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width);
90
96
        s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width);
91
96
        s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width);
92
96
        s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width);
93
96
        s |= (static_cast<int64_t>(input[i])) << (7 * bit_width);
94
95
        // Starting with the highest valid bit, take out 8 bits in sequence
96
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
97
        // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end
98
192
        for (int j = 0; j < bit_width; j++) {
99
96
            output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask;
100
96
        }
101
96
        output += bit_width;
102
96
        s = 0;
103
96
    }
104
105
    // remainder
106
8
    int byte = tail_count * bit_width; // How many bits are left to store
107
8
    int bytes = (byte + 7) >> 3;       // How many more bytes are needed to store the rest of input
108
109
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
110
10
    for (int i = 0; i < tail_count; i++) {
111
2
        s |= (static_cast<int64_t>(input[i + full_batch_size]))
112
2
             << ((tail_count - i - 1) * bit_width);
113
2
    }
114
115
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
116
8
    s <<= (bytes << 3) - byte;
117
118
    // Starting with the highest valid bit, take out 8 bits in sequence
119
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
120
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
121
9
    for (int i = 0; i < bytes; i++) {
122
1
        output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask;
123
1
    }
124
8
}
_ZN5doris10ForEncoderIlE10bit_pack_8EPKlhiPh
Line
Count
Source
78
3.05k
void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
79
3.05k
    int64_t s = 0;
80
3.05k
    uint8_t output_mask = 255;
81
3.05k
    int tail_count = in_num & 7;              // the remainder of in_num modulo 8
82
3.05k
    int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8
83
84
34.9k
    for (int i = 0; i < full_batch_size; i += 8) {
85
        // Put the 8 numbers in the input into s in order, each number occupies bit_width bit
86
31.8k
        s |= static_cast<int64_t>(input[i + 7]);
87
31.8k
        s |= (static_cast<int64_t>(input[i + 6])) << bit_width;
88
31.8k
        s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width);
89
31.8k
        s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width);
90
31.8k
        s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width);
91
31.8k
        s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width);
92
31.8k
        s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width);
93
31.8k
        s |= (static_cast<int64_t>(input[i])) << (7 * bit_width);
94
95
        // Starting with the highest valid bit, take out 8 bits in sequence
96
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
97
        // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end
98
174k
        for (int j = 0; j < bit_width; j++) {
99
142k
            output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask;
100
142k
        }
101
31.8k
        output += bit_width;
102
31.8k
        s = 0;
103
31.8k
    }
104
105
    // remainder
106
3.05k
    int byte = tail_count * bit_width; // How many bits are left to store
107
3.05k
    int bytes = (byte + 7) >> 3;       // How many more bytes are needed to store the rest of input
108
109
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
110
10.2k
    for (int i = 0; i < tail_count; i++) {
111
7.16k
        s |= (static_cast<int64_t>(input[i + full_batch_size]))
112
7.16k
             << ((tail_count - i - 1) * bit_width);
113
7.16k
    }
114
115
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
116
3.05k
    s <<= (bytes << 3) - byte;
117
118
    // Starting with the highest valid bit, take out 8 bits in sequence
119
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
120
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
121
7.77k
    for (int i = 0; i < bytes; i++) {
122
4.72k
        output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask;
123
4.72k
    }
124
3.05k
}
_ZN5doris10ForEncoderInE10bit_pack_8EPKnhiPh
Line
Count
Source
78
12.2k
void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
79
12.2k
    int64_t s = 0;
80
12.2k
    uint8_t output_mask = 255;
81
12.2k
    int tail_count = in_num & 7;              // the remainder of in_num modulo 8
82
12.2k
    int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8
83
84
202k
    for (int i = 0; i < full_batch_size; i += 8) {
85
        // Put the 8 numbers in the input into s in order, each number occupies bit_width bit
86
190k
        s |= static_cast<int64_t>(input[i + 7]);
87
190k
        s |= (static_cast<int64_t>(input[i + 6])) << bit_width;
88
190k
        s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width);
89
190k
        s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width);
90
190k
        s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width);
91
190k
        s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width);
92
190k
        s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width);
93
190k
        s |= (static_cast<int64_t>(input[i])) << (7 * bit_width);
94
95
        // Starting with the highest valid bit, take out 8 bits in sequence
96
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
97
        // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end
98
1.04M
        for (int j = 0; j < bit_width; j++) {
99
857k
            output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask;
100
857k
        }
101
190k
        output += bit_width;
102
190k
        s = 0;
103
190k
    }
104
105
    // remainder
106
12.2k
    int byte = tail_count * bit_width; // How many bits are left to store
107
12.2k
    int bytes = (byte + 7) >> 3;       // How many more bytes are needed to store the rest of input
108
109
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
110
55.2k
    for (int i = 0; i < tail_count; i++) {
111
43.0k
        s |= (static_cast<int64_t>(input[i + full_batch_size]))
112
43.0k
             << ((tail_count - i - 1) * bit_width);
113
43.0k
    }
114
115
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
116
12.2k
    s <<= (bytes << 3) - byte;
117
118
    // Starting with the highest valid bit, take out 8 bits in sequence
119
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
120
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
121
40.6k
    for (int i = 0; i < bytes; i++) {
122
28.4k
        output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask;
123
28.4k
    }
124
12.2k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIhE10bit_pack_8EPKhhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderItE10bit_pack_8EPKthiPh
_ZN5doris10ForEncoderIjE10bit_pack_8EPKjhiPh
Line
Count
Source
78
6
void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
79
6
    int64_t s = 0;
80
6
    uint8_t output_mask = 255;
81
6
    int tail_count = in_num & 7;              // the remainder of in_num modulo 8
82
6
    int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8
83
84
102
    for (int i = 0; i < full_batch_size; i += 8) {
85
        // Put the 8 numbers in the input into s in order, each number occupies bit_width bit
86
96
        s |= static_cast<int64_t>(input[i + 7]);
87
96
        s |= (static_cast<int64_t>(input[i + 6])) << bit_width;
88
96
        s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width);
89
96
        s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width);
90
96
        s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width);
91
96
        s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width);
92
96
        s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width);
93
96
        s |= (static_cast<int64_t>(input[i])) << (7 * bit_width);
94
95
        // Starting with the highest valid bit, take out 8 bits in sequence
96
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
97
        // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end
98
192
        for (int j = 0; j < bit_width; j++) {
99
96
            output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask;
100
96
        }
101
96
        output += bit_width;
102
96
        s = 0;
103
96
    }
104
105
    // remainder
106
6
    int byte = tail_count * bit_width; // How many bits are left to store
107
6
    int bytes = (byte + 7) >> 3;       // How many more bytes are needed to store the rest of input
108
109
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
110
6
    for (int i = 0; i < tail_count; i++) {
111
0
        s |= (static_cast<int64_t>(input[i + full_batch_size]))
112
0
             << ((tail_count - i - 1) * bit_width);
113
0
    }
114
115
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
116
6
    s <<= (bytes << 3) - byte;
117
118
    // Starting with the highest valid bit, take out 8 bits in sequence
119
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
120
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
121
6
    for (int i = 0; i < bytes; i++) {
122
0
        output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask;
123
0
    }
124
6
}
Unexecuted instantiation: _ZN5doris10ForEncoderImE10bit_pack_8EPKmhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10bit_pack_8EPKS1_hiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIoE10bit_pack_8EPKohiPh
125
126
template <typename T>
127
template <typename U>
128
45.8k
void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
129
45.8k
    U s = 0;
130
45.8k
    uint8_t output_mask = 255;
131
45.8k
    int tail_count = in_num & 3;              // the remainder of in_num modulo 4
132
45.8k
    int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4
133
45.8k
    int output_size = 0;                      // How many outputs can be processed at a time
134
45.8k
    int bit_width_remainder =
135
45.8k
            (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time
136
45.8k
    int extra_bit = 0;            // Extra bits after each process
137
138
1.40M
    for (int i = 0; i < full_batch_size; i += 4) {
139
        // Put the 4 numbers in the input into s in order, each number occupies bit_width bit
140
        // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop
141
1.35M
        s <<= bit_width;
142
1.35M
        s |= (static_cast<U>(input[i]));
143
1.35M
        s <<= bit_width;
144
1.35M
        s |= (static_cast<U>(input[i + 1]));
145
1.35M
        s <<= bit_width;
146
1.35M
        s |= (static_cast<U>(input[i + 2]));
147
1.35M
        s <<= bit_width;
148
1.35M
        s |= (static_cast<U>(input[i + 3]));
149
150
        // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s,
151
        // and there are extra_bit bits left over from the last loop,
152
        // divide by 8 to calculate how much output can be processed in this loop.
153
1.35M
        output_size = ((bit_width << 2) + extra_bit) >> 3;
154
155
        // Each loop will leave bit_width_remainder bit unprocessed,
156
        // last loop will leave extra_bit bit, eventually will leave
157
        // (extra_bit + bit_width_remainder) & 7 bit unprocessed
158
1.35M
        extra_bit = (extra_bit + bit_width_remainder) & 7;
159
160
        // Starting with the highest valid bit, take out 8 bits in sequence
161
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
162
        // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end
163
        // But since there are still extra_bit bits that can't be processed, need to add the extra_bit
164
15.2M
        for (int j = 0; j < output_size; j++) {
165
13.8M
            output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask;
166
13.8M
        }
167
1.35M
        output += output_size;
168
169
        // s retains the post extra_bit bit as it is not processed
170
1.35M
        s &= (1 << extra_bit) - 1;
171
1.35M
    }
172
173
    // remainder
174
45.8k
    int byte = tail_count * bit_width;     // How many bits are left to store
175
45.8k
    if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed
176
45.8k
    int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input
177
178
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
179
110k
    for (int i = 0; i < tail_count; i++) {
180
64.4k
        s <<= bit_width;
181
64.4k
        s |= (input[i + full_batch_size]);
182
64.4k
    }
183
184
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
185
45.8k
    s <<= (bytes << 3) - byte;
186
187
    // Starting with the highest valid bit, take out 8 bits in sequence
188
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
189
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
190
231k
    for (int i = 0; i < bytes; i++) {
191
185k
        output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask;
192
185k
    }
193
45.8k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIaE10bit_pack_4IlEEvPKahiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIaE10bit_pack_4InEEvPKahiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIsE10bit_pack_4IlEEvPKshiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIsE10bit_pack_4InEEvPKshiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIiE10bit_pack_4IlEEvPKihiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIiE10bit_pack_4InEEvPKihiPh
_ZN5doris10ForEncoderIlE10bit_pack_4IlEEvPKlhiPh
Line
Count
Source
128
3.03k
void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
129
3.03k
    U s = 0;
130
3.03k
    uint8_t output_mask = 255;
131
3.03k
    int tail_count = in_num & 3;              // the remainder of in_num modulo 4
132
3.03k
    int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4
133
3.03k
    int output_size = 0;                      // How many outputs can be processed at a time
134
3.03k
    int bit_width_remainder =
135
3.03k
            (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time
136
3.03k
    int extra_bit = 0;            // Extra bits after each process
137
138
67.5k
    for (int i = 0; i < full_batch_size; i += 4) {
139
        // Put the 4 numbers in the input into s in order, each number occupies bit_width bit
140
        // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop
141
64.5k
        s <<= bit_width;
142
64.5k
        s |= (static_cast<U>(input[i]));
143
64.5k
        s <<= bit_width;
144
64.5k
        s |= (static_cast<U>(input[i + 1]));
145
64.5k
        s <<= bit_width;
146
64.5k
        s |= (static_cast<U>(input[i + 2]));
147
64.5k
        s <<= bit_width;
148
64.5k
        s |= (static_cast<U>(input[i + 3]));
149
150
        // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s,
151
        // and there are extra_bit bits left over from the last loop,
152
        // divide by 8 to calculate how much output can be processed in this loop.
153
64.5k
        output_size = ((bit_width << 2) + extra_bit) >> 3;
154
155
        // Each loop will leave bit_width_remainder bit unprocessed,
156
        // last loop will leave extra_bit bit, eventually will leave
157
        // (extra_bit + bit_width_remainder) & 7 bit unprocessed
158
64.5k
        extra_bit = (extra_bit + bit_width_remainder) & 7;
159
160
        // Starting with the highest valid bit, take out 8 bits in sequence
161
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
162
        // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end
163
        // But since there are still extra_bit bits that can't be processed, need to add the extra_bit
164
467k
        for (int j = 0; j < output_size; j++) {
165
402k
            output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask;
166
402k
        }
167
64.5k
        output += output_size;
168
169
        // s retains the post extra_bit bit as it is not processed
170
64.5k
        s &= (1 << extra_bit) - 1;
171
64.5k
    }
172
173
    // remainder
174
3.03k
    int byte = tail_count * bit_width;     // How many bits are left to store
175
3.03k
    if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed
176
3.03k
    int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input
177
178
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
179
6.08k
    for (int i = 0; i < tail_count; i++) {
180
3.04k
        s <<= bit_width;
181
3.04k
        s |= (input[i + full_batch_size]);
182
3.04k
    }
183
184
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
185
3.03k
    s <<= (bytes << 3) - byte;
186
187
    // Starting with the highest valid bit, take out 8 bits in sequence
188
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
189
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
190
8.75k
    for (int i = 0; i < bytes; i++) {
191
5.71k
        output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask;
192
5.71k
    }
193
3.03k
}
_ZN5doris10ForEncoderIlE10bit_pack_4InEEvPKlhiPh
Line
Count
Source
128
6.08k
void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
129
6.08k
    U s = 0;
130
6.08k
    uint8_t output_mask = 255;
131
6.08k
    int tail_count = in_num & 3;              // the remainder of in_num modulo 4
132
6.08k
    int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4
133
6.08k
    int output_size = 0;                      // How many outputs can be processed at a time
134
6.08k
    int bit_width_remainder =
135
6.08k
            (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time
136
6.08k
    int extra_bit = 0;            // Extra bits after each process
137
138
135k
    for (int i = 0; i < full_batch_size; i += 4) {
139
        // Put the 4 numbers in the input into s in order, each number occupies bit_width bit
140
        // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop
141
129k
        s <<= bit_width;
142
129k
        s |= (static_cast<U>(input[i]));
143
129k
        s <<= bit_width;
144
129k
        s |= (static_cast<U>(input[i + 1]));
145
129k
        s <<= bit_width;
146
129k
        s |= (static_cast<U>(input[i + 2]));
147
129k
        s <<= bit_width;
148
129k
        s |= (static_cast<U>(input[i + 3]));
149
150
        // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s,
151
        // and there are extra_bit bits left over from the last loop,
152
        // divide by 8 to calculate how much output can be processed in this loop.
153
129k
        output_size = ((bit_width << 2) + extra_bit) >> 3;
154
155
        // Each loop will leave bit_width_remainder bit unprocessed,
156
        // last loop will leave extra_bit bit, eventually will leave
157
        // (extra_bit + bit_width_remainder) & 7 bit unprocessed
158
129k
        extra_bit = (extra_bit + bit_width_remainder) & 7;
159
160
        // Starting with the highest valid bit, take out 8 bits in sequence
161
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
162
        // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end
163
        // But since there are still extra_bit bits that can't be processed, need to add the extra_bit
164
1.70M
        for (int j = 0; j < output_size; j++) {
165
1.58M
            output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask;
166
1.58M
        }
167
129k
        output += output_size;
168
169
        // s retains the post extra_bit bit as it is not processed
170
129k
        s &= (1 << extra_bit) - 1;
171
129k
    }
172
173
    // remainder
174
6.08k
    int byte = tail_count * bit_width;     // How many bits are left to store
175
6.08k
    if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed
176
6.08k
    int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input
177
178
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
179
12.2k
    for (int i = 0; i < tail_count; i++) {
180
6.12k
        s <<= bit_width;
181
6.12k
        s |= (input[i + full_batch_size]);
182
6.12k
    }
183
184
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
185
6.08k
    s <<= (bytes << 3) - byte;
186
187
    // Starting with the highest valid bit, take out 8 bits in sequence
188
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
189
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
190
26.7k
    for (int i = 0; i < bytes; i++) {
191
20.6k
        output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask;
192
20.6k
    }
193
6.08k
}
_ZN5doris10ForEncoderInE10bit_pack_4IlEEvPKnhiPh
Line
Count
Source
128
12.2k
void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
129
12.2k
    U s = 0;
130
12.2k
    uint8_t output_mask = 255;
131
12.2k
    int tail_count = in_num & 3;              // the remainder of in_num modulo 4
132
12.2k
    int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4
133
12.2k
    int output_size = 0;                      // How many outputs can be processed at a time
134
12.2k
    int bit_width_remainder =
135
12.2k
            (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time
136
12.2k
    int extra_bit = 0;            // Extra bits after each process
137
138
399k
    for (int i = 0; i < full_batch_size; i += 4) {
139
        // Put the 4 numbers in the input into s in order, each number occupies bit_width bit
140
        // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop
141
387k
        s <<= bit_width;
142
387k
        s |= (static_cast<U>(input[i]));
143
387k
        s <<= bit_width;
144
387k
        s |= (static_cast<U>(input[i + 1]));
145
387k
        s <<= bit_width;
146
387k
        s |= (static_cast<U>(input[i + 2]));
147
387k
        s <<= bit_width;
148
387k
        s |= (static_cast<U>(input[i + 3]));
149
150
        // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s,
151
        // and there are extra_bit bits left over from the last loop,
152
        // divide by 8 to calculate how much output can be processed in this loop.
153
387k
        output_size = ((bit_width << 2) + extra_bit) >> 3;
154
155
        // Each loop will leave bit_width_remainder bit unprocessed,
156
        // last loop will leave extra_bit bit, eventually will leave
157
        // (extra_bit + bit_width_remainder) & 7 bit unprocessed
158
387k
        extra_bit = (extra_bit + bit_width_remainder) & 7;
159
160
        // Starting with the highest valid bit, take out 8 bits in sequence
161
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
162
        // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end
163
        // But since there are still extra_bit bits that can't be processed, need to add the extra_bit
164
2.80M
        for (int j = 0; j < output_size; j++) {
165
2.41M
            output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask;
166
2.41M
        }
167
387k
        output += output_size;
168
169
        // s retains the post extra_bit bit as it is not processed
170
387k
        s &= (1 << extra_bit) - 1;
171
387k
    }
172
173
    // remainder
174
12.2k
    int byte = tail_count * bit_width;     // How many bits are left to store
175
12.2k
    if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed
176
12.2k
    int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input
177
178
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
179
30.6k
    for (int i = 0; i < tail_count; i++) {
180
18.4k
        s <<= bit_width;
181
18.4k
        s |= (input[i + full_batch_size]);
182
18.4k
    }
183
184
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
185
12.2k
    s <<= (bytes << 3) - byte;
186
187
    // Starting with the highest valid bit, take out 8 bits in sequence
188
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
189
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
190
46.8k
    for (int i = 0; i < bytes; i++) {
191
34.5k
        output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask;
192
34.5k
    }
193
12.2k
}
_ZN5doris10ForEncoderInE10bit_pack_4InEEvPKnhiPh
Line
Count
Source
128
24.4k
void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
129
24.4k
    U s = 0;
130
24.4k
    uint8_t output_mask = 255;
131
24.4k
    int tail_count = in_num & 3;              // the remainder of in_num modulo 4
132
24.4k
    int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4
133
24.4k
    int output_size = 0;                      // How many outputs can be processed at a time
134
24.4k
    int bit_width_remainder =
135
24.4k
            (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time
136
24.4k
    int extra_bit = 0;            // Extra bits after each process
137
138
798k
    for (int i = 0; i < full_batch_size; i += 4) {
139
        // Put the 4 numbers in the input into s in order, each number occupies bit_width bit
140
        // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop
141
774k
        s <<= bit_width;
142
774k
        s |= (static_cast<U>(input[i]));
143
774k
        s <<= bit_width;
144
774k
        s |= (static_cast<U>(input[i + 1]));
145
774k
        s <<= bit_width;
146
774k
        s |= (static_cast<U>(input[i + 2]));
147
774k
        s <<= bit_width;
148
774k
        s |= (static_cast<U>(input[i + 3]));
149
150
        // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s,
151
        // and there are extra_bit bits left over from the last loop,
152
        // divide by 8 to calculate how much output can be processed in this loop.
153
774k
        output_size = ((bit_width << 2) + extra_bit) >> 3;
154
155
        // Each loop will leave bit_width_remainder bit unprocessed,
156
        // last loop will leave extra_bit bit, eventually will leave
157
        // (extra_bit + bit_width_remainder) & 7 bit unprocessed
158
774k
        extra_bit = (extra_bit + bit_width_remainder) & 7;
159
160
        // Starting with the highest valid bit, take out 8 bits in sequence
161
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
162
        // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end
163
        // But since there are still extra_bit bits that can't be processed, need to add the extra_bit
164
10.2M
        for (int j = 0; j < output_size; j++) {
165
9.48M
            output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask;
166
9.48M
        }
167
774k
        output += output_size;
168
169
        // s retains the post extra_bit bit as it is not processed
170
774k
        s &= (1 << extra_bit) - 1;
171
774k
    }
172
173
    // remainder
174
24.4k
    int byte = tail_count * bit_width;     // How many bits are left to store
175
24.4k
    if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed
176
24.4k
    int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input
177
178
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
179
61.3k
    for (int i = 0; i < tail_count; i++) {
180
36.8k
        s <<= bit_width;
181
36.8k
        s |= (input[i + full_batch_size]);
182
36.8k
    }
183
184
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
185
24.4k
    s <<= (bytes << 3) - byte;
186
187
    // Starting with the highest valid bit, take out 8 bits in sequence
188
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
189
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
190
148k
    for (int i = 0; i < bytes; i++) {
191
124k
        output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask;
192
124k
    }
193
24.4k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIhE10bit_pack_4IlEEvPKhhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIhE10bit_pack_4InEEvPKhhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderItE10bit_pack_4IlEEvPKthiPh
Unexecuted instantiation: _ZN5doris10ForEncoderItE10bit_pack_4InEEvPKthiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIjE10bit_pack_4IlEEvPKjhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIjE10bit_pack_4InEEvPKjhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderImE10bit_pack_4IlEEvPKmhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderImE10bit_pack_4InEEvPKmhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10bit_pack_4IlEEvPKS1_hiPh
Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10bit_pack_4InEEvPKS1_hiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIoE10bit_pack_4IlEEvPKohiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIoE10bit_pack_4InEEvPKohiPh
194
195
template <typename T>
196
181k
void ForEncoder<T>::bit_pack_1(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
197
181k
    int output_mask = 255;
198
181k
    int need_bit = 0; // still need
199
200
21.9M
    for (int i = 0; i < in_num; i++) {
201
21.7M
        T x = input[i];
202
21.7M
        int width = bit_width;
203
21.7M
        if (need_bit) {
204
            // The last time we take away the high 8 - need_bit,
205
            // we need to make up the rest of the need_bit from the width.
206
            // Use width - need_bit to compute high need_bit bits
207
15.0M
            *output |= x >> (width - need_bit);
208
15.0M
            output++;
209
            // There are need_bit bits being used, so subtract
210
15.0M
            width -= need_bit;
211
15.0M
        }
212
21.7M
        int num = width >> 3;      // How many outputs can be processed at a time
213
21.7M
        int remainder = width & 7; // How many bits are left to store
214
215
        // Starting with the highest valid bit, take out 8 bits in sequence
216
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
217
        // (num-j-1)<<3 used to calculate how many bits need to be removed at the end
218
        // But since there are still remainder bits that can't be processed, need to add the remainder
219
223M
        for (int j = 0; j < num; j++) {
220
202M
            *output = cast_set<uint8_t>((x >> (((num - j - 1) << 3) + remainder)) & output_mask);
221
202M
            output++;
222
202M
        }
223
21.7M
        if (remainder) {
224
            // Process the last remaining remainder bit.
225
            // y = (x & ((1 << remainder) - 1)) extract the last remainder bits.
226
            // ouput = y << (8 - reaminder)  Use the high 8 - remainder bit
227
15.1M
            *output = cast_set<uint8_t>((x & ((1 << remainder) - 1)) << (8 - remainder));
228
            // Already have remainder bits, next time need 8-remainder bits
229
15.1M
            need_bit = 8 - remainder;
230
15.1M
        } else {
231
6.57M
            need_bit = 0;
232
6.57M
        }
233
21.7M
    }
234
181k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIaE10bit_pack_1EPKahiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIsE10bit_pack_1EPKshiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIiE10bit_pack_1EPKihiPh
_ZN5doris10ForEncoderIlE10bit_pack_1EPKlhiPh
Line
Count
Source
196
12.1k
void ForEncoder<T>::bit_pack_1(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
197
12.1k
    int output_mask = 255;
198
12.1k
    int need_bit = 0; // still need
199
200
1.05M
    for (int i = 0; i < in_num; i++) {
201
1.04M
        T x = input[i];
202
1.04M
        int width = bit_width;
203
1.04M
        if (need_bit) {
204
            // The last time we take away the high 8 - need_bit,
205
            // we need to make up the rest of the need_bit from the width.
206
            // Use width - need_bit to compute high need_bit bits
207
743k
            *output |= x >> (width - need_bit);
208
743k
            output++;
209
            // There are need_bit bits being used, so subtract
210
743k
            width -= need_bit;
211
743k
        }
212
1.04M
        int num = width >> 3;      // How many outputs can be processed at a time
213
1.04M
        int remainder = width & 7; // How many bits are left to store
214
215
        // Starting with the highest valid bit, take out 8 bits in sequence
216
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
217
        // (num-j-1)<<3 used to calculate how many bits need to be removed at the end
218
        // But since there are still remainder bits that can't be processed, need to add the remainder
219
6.62M
        for (int j = 0; j < num; j++) {
220
5.58M
            *output = cast_set<uint8_t>((x >> (((num - j - 1) << 3) + remainder)) & output_mask);
221
5.58M
            output++;
222
5.58M
        }
223
1.04M
        if (remainder) {
224
            // Process the last remaining remainder bit.
225
            // y = (x & ((1 << remainder) - 1)) extract the last remainder bits.
226
            // ouput = y << (8 - reaminder)  Use the high 8 - remainder bit
227
749k
            *output = cast_set<uint8_t>((x & ((1 << remainder) - 1)) << (8 - remainder));
228
            // Already have remainder bits, next time need 8-remainder bits
229
749k
            need_bit = 8 - remainder;
230
749k
        } else {
231
294k
            need_bit = 0;
232
294k
        }
233
1.04M
    }
234
12.1k
}
_ZN5doris10ForEncoderInE10bit_pack_1EPKnhiPh
Line
Count
Source
196
169k
void ForEncoder<T>::bit_pack_1(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
197
169k
    int output_mask = 255;
198
169k
    int need_bit = 0; // still need
199
200
20.8M
    for (int i = 0; i < in_num; i++) {
201
20.6M
        T x = input[i];
202
20.6M
        int width = bit_width;
203
20.6M
        if (need_bit) {
204
            // The last time we take away the high 8 - need_bit,
205
            // we need to make up the rest of the need_bit from the width.
206
            // Use width - need_bit to compute high need_bit bits
207
14.3M
            *output |= x >> (width - need_bit);
208
14.3M
            output++;
209
            // There are need_bit bits being used, so subtract
210
14.3M
            width -= need_bit;
211
14.3M
        }
212
20.6M
        int num = width >> 3;      // How many outputs can be processed at a time
213
20.6M
        int remainder = width & 7; // How many bits are left to store
214
215
        // Starting with the highest valid bit, take out 8 bits in sequence
216
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
217
        // (num-j-1)<<3 used to calculate how many bits need to be removed at the end
218
        // But since there are still remainder bits that can't be processed, need to add the remainder
219
217M
        for (int j = 0; j < num; j++) {
220
196M
            *output = cast_set<uint8_t>((x >> (((num - j - 1) << 3) + remainder)) & output_mask);
221
196M
            output++;
222
196M
        }
223
20.6M
        if (remainder) {
224
            // Process the last remaining remainder bit.
225
            // y = (x & ((1 << remainder) - 1)) extract the last remainder bits.
226
            // ouput = y << (8 - reaminder)  Use the high 8 - remainder bit
227
14.4M
            *output = cast_set<uint8_t>((x & ((1 << remainder) - 1)) << (8 - remainder));
228
            // Already have remainder bits, next time need 8-remainder bits
229
14.4M
            need_bit = 8 - remainder;
230
14.4M
        } else {
231
6.27M
            need_bit = 0;
232
6.27M
        }
233
20.6M
    }
234
169k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIhE10bit_pack_1EPKhhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderItE10bit_pack_1EPKthiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIjE10bit_pack_1EPKjhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderImE10bit_pack_1EPKmhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10bit_pack_1EPKS1_hiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIoE10bit_pack_1EPKohiPh
235
236
// Use as few bit as possible to store a piece of integer data.
237
// param[in] input: the integer list need to pack
238
// param[in] in_num: the number integer need to pack
239
// param[in] bit_width: how many bit we use to store each integer data
240
// param[out] out: the packed result
241
242
// For example:
243
// The input is int32 list: 1, 2, 4, 8 and bit_width is 4
244
// The output will be: 0001 0010 0100 1000
245
template <typename T>
246
243k
void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
247
243k
    if (in_num == 0 || bit_width == 0) {
248
260
        return;
249
260
    }
250
    /*
251
        bit_width <= 8 : pack_8 > pack_16 > pack_32
252
        bit_width <= 16 : pack_4 > pack_8 > pack_16
253
        bit_width <= 32 : pack_4 >= pack_2 > pack_8 
254
        (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently)
255
        bit_width <= 64 : pack_1 > pack_4
256
    */
257
242k
    if (bit_width <= 8) {
258
15.3k
        bit_pack_8(input, in_num, bit_width, output);
259
227k
    } else if (bit_width <= 16) {
260
15.2k
        bit_pack_4<int64_t>(input, in_num, bit_width, output);
261
212k
    } else if (bit_width <= 32) {
262
30.5k
        bit_pack_4<__int128_t>(input, in_num, bit_width, output);
263
181k
    } else {
264
181k
        bit_pack_1(input, in_num, bit_width, output);
265
181k
    }
266
242k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIaE8bit_packEPKahiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIsE8bit_packEPKshiPh
_ZN5doris10ForEncoderIiE8bit_packEPKihiPh
Line
Count
Source
246
9
void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
247
9
    if (in_num == 0 || bit_width == 0) {
248
1
        return;
249
1
    }
250
    /*
251
        bit_width <= 8 : pack_8 > pack_16 > pack_32
252
        bit_width <= 16 : pack_4 > pack_8 > pack_16
253
        bit_width <= 32 : pack_4 >= pack_2 > pack_8 
254
        (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently)
255
        bit_width <= 64 : pack_1 > pack_4
256
    */
257
8
    if (bit_width <= 8) {
258
8
        bit_pack_8(input, in_num, bit_width, output);
259
8
    } else if (bit_width <= 16) {
260
0
        bit_pack_4<int64_t>(input, in_num, bit_width, output);
261
0
    } else if (bit_width <= 32) {
262
0
        bit_pack_4<__int128_t>(input, in_num, bit_width, output);
263
0
    } else {
264
0
        bit_pack_1(input, in_num, bit_width, output);
265
0
    }
266
8
}
_ZN5doris10ForEncoderIlE8bit_packEPKlhiPh
Line
Count
Source
246
24.4k
void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
247
24.4k
    if (in_num == 0 || bit_width == 0) {
248
131
        return;
249
131
    }
250
    /*
251
        bit_width <= 8 : pack_8 > pack_16 > pack_32
252
        bit_width <= 16 : pack_4 > pack_8 > pack_16
253
        bit_width <= 32 : pack_4 >= pack_2 > pack_8 
254
        (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently)
255
        bit_width <= 64 : pack_1 > pack_4
256
    */
257
24.3k
    if (bit_width <= 8) {
258
3.05k
        bit_pack_8(input, in_num, bit_width, output);
259
21.2k
    } else if (bit_width <= 16) {
260
3.03k
        bit_pack_4<int64_t>(input, in_num, bit_width, output);
261
18.2k
    } else if (bit_width <= 32) {
262
6.08k
        bit_pack_4<__int128_t>(input, in_num, bit_width, output);
263
12.1k
    } else {
264
12.1k
        bit_pack_1(input, in_num, bit_width, output);
265
12.1k
    }
266
24.3k
}
_ZN5doris10ForEncoderInE8bit_packEPKnhiPh
Line
Count
Source
246
218k
void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
247
218k
    if (in_num == 0 || bit_width == 0) {
248
128
        return;
249
128
    }
250
    /*
251
        bit_width <= 8 : pack_8 > pack_16 > pack_32
252
        bit_width <= 16 : pack_4 > pack_8 > pack_16
253
        bit_width <= 32 : pack_4 >= pack_2 > pack_8 
254
        (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently)
255
        bit_width <= 64 : pack_1 > pack_4
256
    */
257
218k
    if (bit_width <= 8) {
258
12.2k
        bit_pack_8(input, in_num, bit_width, output);
259
206k
    } else if (bit_width <= 16) {
260
12.2k
        bit_pack_4<int64_t>(input, in_num, bit_width, output);
261
194k
    } else if (bit_width <= 32) {
262
24.4k
        bit_pack_4<__int128_t>(input, in_num, bit_width, output);
263
169k
    } else {
264
169k
        bit_pack_1(input, in_num, bit_width, output);
265
169k
    }
266
218k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIhE8bit_packEPKhhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderItE8bit_packEPKthiPh
_ZN5doris10ForEncoderIjE8bit_packEPKjhiPh
Line
Count
Source
246
6
void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
247
6
    if (in_num == 0 || bit_width == 0) {
248
0
        return;
249
0
    }
250
    /*
251
        bit_width <= 8 : pack_8 > pack_16 > pack_32
252
        bit_width <= 16 : pack_4 > pack_8 > pack_16
253
        bit_width <= 32 : pack_4 >= pack_2 > pack_8 
254
        (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently)
255
        bit_width <= 64 : pack_1 > pack_4
256
    */
257
6
    if (bit_width <= 8) {
258
6
        bit_pack_8(input, in_num, bit_width, output);
259
6
    } else if (bit_width <= 16) {
260
0
        bit_pack_4<int64_t>(input, in_num, bit_width, output);
261
0
    } else if (bit_width <= 32) {
262
0
        bit_pack_4<__int128_t>(input, in_num, bit_width, output);
263
0
    } else {
264
0
        bit_pack_1(input, in_num, bit_width, output);
265
0
    }
266
6
}
Unexecuted instantiation: _ZN5doris10ForEncoderImE8bit_packEPKmhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE8bit_packEPKS1_hiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIoE8bit_packEPKohiPh
267
268
template <typename T>
269
48.9k
void ForEncoder<T>::bit_packing_one_frame_value(const T* input) {
270
48.9k
    T min = input[0];
271
48.9k
    T max = input[0];
272
48.9k
    bool is_ascending = true;
273
48.9k
    uint8_t bit_width = 0;
274
48.9k
    T half_max_delta = numeric_limits_max() >> 1;
275
48.9k
    bool is_keep_original_value = false;
276
277
    // 1. make sure order_flag, save_original_value, and find max&min.
278
4.18M
    for (uint8_t i = 1; i < _buffered_values_num; ++i) {
279
4.13M
        if (is_ascending) {
280
86.4k
            if (input[i] < input[i - 1]) {
281
48.4k
                is_ascending = false;
282
48.4k
            } else {
283
38.0k
                if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow
284
0
                    is_keep_original_value = true;
285
38.0k
                } else {
286
38.0k
                    bit_width = std::max(bit_width, bits(input[i] - input[i - 1]));
287
38.0k
                }
288
38.0k
            }
289
86.4k
        }
290
291
4.13M
        if (input[i] < min) {
292
180k
            min = input[i];
293
180k
            continue;
294
180k
        }
295
296
3.95M
        if (input[i] > max) {
297
183k
            max = input[i];
298
183k
        }
299
3.95M
    }
300
48.9k
    if (!is_ascending) {
301
48.4k
        if ((max >> 1) - (min >> 1) > half_max_delta) {
302
0
            is_keep_original_value = true;
303
0
        }
304
48.4k
    }
305
306
    // 2. save min value.
307
48.9k
    if (sizeof(T) == 16) {
308
24.4k
        put_fixed128_le(_buffer, static_cast<uint128_t>(min));
309
24.4k
    } else if (sizeof(T) == 8) {
310
24.4k
        put_fixed64_le(_buffer, static_cast<uint64_t>(min));
311
24.4k
    } else {
312
15
        put_fixed32_le(_buffer, static_cast<uint32_t>(min));
313
15
    }
314
315
    // 3.1 save original value.
316
48.9k
    if (is_keep_original_value) {
317
0
        bit_width = sizeof(T) * 8;
318
0
        uint32_t len = _buffered_values_num * bit_width;
319
0
        _buffer->reserve(_buffer->size() + len);
320
0
        size_t origin_size = _buffer->size();
321
0
        _buffer->resize(origin_size + len);
322
0
        bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size);
323
48.9k
    } else {
324
        // 3.2 bit pack.
325
        // improve for ascending order input, we could use fewer bit
326
48.9k
        T delta_values[FRAME_VALUE_NUM];
327
48.9k
        if (is_ascending) {
328
449
            delta_values[0] = 0;
329
3.17k
            for (uint8_t i = 1; i < _buffered_values_num; ++i) {
330
2.72k
                delta_values[i] = input[i] - input[i - 1];
331
2.72k
            }
332
48.4k
        } else {
333
48.4k
            bit_width = bits(static_cast<T>(max - min));
334
4.22M
            for (uint8_t i = 0; i < _buffered_values_num; ++i) {
335
4.17M
                delta_values[i] = input[i] - min;
336
4.17M
            }
337
48.4k
        }
338
339
48.9k
        uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8);
340
341
48.9k
        _buffer->reserve(_buffer->size() + packing_len);
342
48.9k
        size_t origin_size = _buffer->size();
343
48.9k
        _buffer->resize(origin_size + packing_len);
344
48.9k
        bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size);
345
48.9k
    }
346
48.9k
    uint8_t storage_format = 0;
347
48.9k
    if (is_keep_original_value) {
348
0
        storage_format = 2;
349
48.9k
    } else if (is_ascending) {
350
449
        storage_format = 1;
351
449
    }
352
48.9k
    _storage_formats.push_back(storage_format);
353
48.9k
    _bit_widths.push_back(bit_width);
354
355
48.9k
    _buffered_values_num = 0;
356
48.9k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIaE27bit_packing_one_frame_valueEPKa
Unexecuted instantiation: _ZN5doris10ForEncoderIsE27bit_packing_one_frame_valueEPKs
_ZN5doris10ForEncoderIiE27bit_packing_one_frame_valueEPKi
Line
Count
Source
269
9
void ForEncoder<T>::bit_packing_one_frame_value(const T* input) {
270
9
    T min = input[0];
271
9
    T max = input[0];
272
9
    bool is_ascending = true;
273
9
    uint8_t bit_width = 0;
274
9
    T half_max_delta = numeric_limits_max() >> 1;
275
9
    bool is_keep_original_value = false;
276
277
    // 1. make sure order_flag, save_original_value, and find max&min.
278
771
    for (uint8_t i = 1; i < _buffered_values_num; ++i) {
279
762
        if (is_ascending) {
280
762
            if (input[i] < input[i - 1]) {
281
0
                is_ascending = false;
282
762
            } else {
283
762
                if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow
284
0
                    is_keep_original_value = true;
285
762
                } else {
286
762
                    bit_width = std::max(bit_width, bits(input[i] - input[i - 1]));
287
762
                }
288
762
            }
289
762
        }
290
291
762
        if (input[i] < min) {
292
0
            min = input[i];
293
0
            continue;
294
0
        }
295
296
762
        if (input[i] > max) {
297
762
            max = input[i];
298
762
        }
299
762
    }
300
9
    if (!is_ascending) {
301
0
        if ((max >> 1) - (min >> 1) > half_max_delta) {
302
0
            is_keep_original_value = true;
303
0
        }
304
0
    }
305
306
    // 2. save min value.
307
9
    if (sizeof(T) == 16) {
308
0
        put_fixed128_le(_buffer, static_cast<uint128_t>(min));
309
9
    } else if (sizeof(T) == 8) {
310
0
        put_fixed64_le(_buffer, static_cast<uint64_t>(min));
311
9
    } else {
312
9
        put_fixed32_le(_buffer, static_cast<uint32_t>(min));
313
9
    }
314
315
    // 3.1 save original value.
316
9
    if (is_keep_original_value) {
317
0
        bit_width = sizeof(T) * 8;
318
0
        uint32_t len = _buffered_values_num * bit_width;
319
0
        _buffer->reserve(_buffer->size() + len);
320
0
        size_t origin_size = _buffer->size();
321
0
        _buffer->resize(origin_size + len);
322
0
        bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size);
323
9
    } else {
324
        // 3.2 bit pack.
325
        // improve for ascending order input, we could use fewer bit
326
9
        T delta_values[FRAME_VALUE_NUM];
327
9
        if (is_ascending) {
328
9
            delta_values[0] = 0;
329
771
            for (uint8_t i = 1; i < _buffered_values_num; ++i) {
330
762
                delta_values[i] = input[i] - input[i - 1];
331
762
            }
332
9
        } else {
333
0
            bit_width = bits(static_cast<T>(max - min));
334
0
            for (uint8_t i = 0; i < _buffered_values_num; ++i) {
335
0
                delta_values[i] = input[i] - min;
336
0
            }
337
0
        }
338
339
9
        uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8);
340
341
9
        _buffer->reserve(_buffer->size() + packing_len);
342
9
        size_t origin_size = _buffer->size();
343
9
        _buffer->resize(origin_size + packing_len);
344
9
        bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size);
345
9
    }
346
9
    uint8_t storage_format = 0;
347
9
    if (is_keep_original_value) {
348
0
        storage_format = 2;
349
9
    } else if (is_ascending) {
350
9
        storage_format = 1;
351
9
    }
352
9
    _storage_formats.push_back(storage_format);
353
9
    _bit_widths.push_back(bit_width);
354
355
9
    _buffered_values_num = 0;
356
9
}
_ZN5doris10ForEncoderIlE27bit_packing_one_frame_valueEPKl
Line
Count
Source
269
24.4k
void ForEncoder<T>::bit_packing_one_frame_value(const T* input) {
270
24.4k
    T min = input[0];
271
24.4k
    T max = input[0];
272
24.4k
    bool is_ascending = true;
273
24.4k
    uint8_t bit_width = 0;
274
24.4k
    T half_max_delta = numeric_limits_max() >> 1;
275
24.4k
    bool is_keep_original_value = false;
276
277
    // 1. make sure order_flag, save_original_value, and find max&min.
278
2.08M
    for (uint8_t i = 1; i < _buffered_values_num; ++i) {
279
2.06M
        if (is_ascending) {
280
43.3k
            if (input[i] < input[i - 1]) {
281
24.2k
                is_ascending = false;
282
24.2k
            } else {
283
19.1k
                if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow
284
0
                    is_keep_original_value = true;
285
19.1k
                } else {
286
19.1k
                    bit_width = std::max(bit_width, bits(input[i] - input[i - 1]));
287
19.1k
                }
288
19.1k
            }
289
43.3k
        }
290
291
2.06M
        if (input[i] < min) {
292
88.0k
            min = input[i];
293
88.0k
            continue;
294
88.0k
        }
295
296
1.97M
        if (input[i] > max) {
297
89.5k
            max = input[i];
298
89.5k
        }
299
1.97M
    }
300
24.4k
    if (!is_ascending) {
301
24.2k
        if ((max >> 1) - (min >> 1) > half_max_delta) {
302
0
            is_keep_original_value = true;
303
0
        }
304
24.2k
    }
305
306
    // 2. save min value.
307
24.4k
    if (sizeof(T) == 16) {
308
0
        put_fixed128_le(_buffer, static_cast<uint128_t>(min));
309
24.4k
    } else if (sizeof(T) == 8) {
310
24.4k
        put_fixed64_le(_buffer, static_cast<uint64_t>(min));
311
24.4k
    } else {
312
0
        put_fixed32_le(_buffer, static_cast<uint32_t>(min));
313
0
    }
314
315
    // 3.1 save original value.
316
24.4k
    if (is_keep_original_value) {
317
0
        bit_width = sizeof(T) * 8;
318
0
        uint32_t len = _buffered_values_num * bit_width;
319
0
        _buffer->reserve(_buffer->size() + len);
320
0
        size_t origin_size = _buffer->size();
321
0
        _buffer->resize(origin_size + len);
322
0
        bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size);
323
24.4k
    } else {
324
        // 3.2 bit pack.
325
        // improve for ascending order input, we could use fewer bit
326
24.4k
        T delta_values[FRAME_VALUE_NUM];
327
24.4k
        if (is_ascending) {
328
220
            delta_values[0] = 0;
329
1.29k
            for (uint8_t i = 1; i < _buffered_values_num; ++i) {
330
1.07k
                delta_values[i] = input[i] - input[i - 1];
331
1.07k
            }
332
24.2k
        } else {
333
24.2k
            bit_width = bits(static_cast<T>(max - min));
334
2.11M
            for (uint8_t i = 0; i < _buffered_values_num; ++i) {
335
2.08M
                delta_values[i] = input[i] - min;
336
2.08M
            }
337
24.2k
        }
338
339
24.4k
        uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8);
340
341
24.4k
        _buffer->reserve(_buffer->size() + packing_len);
342
24.4k
        size_t origin_size = _buffer->size();
343
24.4k
        _buffer->resize(origin_size + packing_len);
344
24.4k
        bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size);
345
24.4k
    }
346
24.4k
    uint8_t storage_format = 0;
347
24.4k
    if (is_keep_original_value) {
348
0
        storage_format = 2;
349
24.4k
    } else if (is_ascending) {
350
220
        storage_format = 1;
351
220
    }
352
24.4k
    _storage_formats.push_back(storage_format);
353
24.4k
    _bit_widths.push_back(bit_width);
354
355
24.4k
    _buffered_values_num = 0;
356
24.4k
}
_ZN5doris10ForEncoderInE27bit_packing_one_frame_valueEPKn
Line
Count
Source
269
24.4k
void ForEncoder<T>::bit_packing_one_frame_value(const T* input) {
270
24.4k
    T min = input[0];
271
24.4k
    T max = input[0];
272
24.4k
    bool is_ascending = true;
273
24.4k
    uint8_t bit_width = 0;
274
24.4k
    T half_max_delta = numeric_limits_max() >> 1;
275
24.4k
    bool is_keep_original_value = false;
276
277
    // 1. make sure order_flag, save_original_value, and find max&min.
278
2.08M
    for (uint8_t i = 1; i < _buffered_values_num; ++i) {
279
2.06M
        if (is_ascending) {
280
41.5k
            if (input[i] < input[i - 1]) {
281
24.2k
                is_ascending = false;
282
24.2k
            } else {
283
17.3k
                if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow
284
0
                    is_keep_original_value = true;
285
17.3k
                } else {
286
17.3k
                    bit_width = std::max(bit_width, bits(input[i] - input[i - 1]));
287
17.3k
                }
288
17.3k
            }
289
41.5k
        }
290
291
2.06M
        if (input[i] < min) {
292
92.7k
            min = input[i];
293
92.7k
            continue;
294
92.7k
        }
295
296
1.97M
        if (input[i] > max) {
297
92.5k
            max = input[i];
298
92.5k
        }
299
1.97M
    }
300
24.4k
    if (!is_ascending) {
301
24.2k
        if ((max >> 1) - (min >> 1) > half_max_delta) {
302
0
            is_keep_original_value = true;
303
0
        }
304
24.2k
    }
305
306
    // 2. save min value.
307
24.4k
    if (sizeof(T) == 16) {
308
24.4k
        put_fixed128_le(_buffer, static_cast<uint128_t>(min));
309
24.4k
    } else if (sizeof(T) == 8) {
310
0
        put_fixed64_le(_buffer, static_cast<uint64_t>(min));
311
0
    } else {
312
0
        put_fixed32_le(_buffer, static_cast<uint32_t>(min));
313
0
    }
314
315
    // 3.1 save original value.
316
24.4k
    if (is_keep_original_value) {
317
0
        bit_width = sizeof(T) * 8;
318
0
        uint32_t len = _buffered_values_num * bit_width;
319
0
        _buffer->reserve(_buffer->size() + len);
320
0
        size_t origin_size = _buffer->size();
321
0
        _buffer->resize(origin_size + len);
322
0
        bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size);
323
24.4k
    } else {
324
        // 3.2 bit pack.
325
        // improve for ascending order input, we could use fewer bit
326
24.4k
        T delta_values[FRAME_VALUE_NUM];
327
24.4k
        if (is_ascending) {
328
214
            delta_values[0] = 0;
329
338
            for (uint8_t i = 1; i < _buffered_values_num; ++i) {
330
124
                delta_values[i] = input[i] - input[i - 1];
331
124
            }
332
24.2k
        } else {
333
24.2k
            bit_width = bits(static_cast<T>(max - min));
334
2.11M
            for (uint8_t i = 0; i < _buffered_values_num; ++i) {
335
2.08M
                delta_values[i] = input[i] - min;
336
2.08M
            }
337
24.2k
        }
338
339
24.4k
        uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8);
340
341
24.4k
        _buffer->reserve(_buffer->size() + packing_len);
342
24.4k
        size_t origin_size = _buffer->size();
343
24.4k
        _buffer->resize(origin_size + packing_len);
344
24.4k
        bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size);
345
24.4k
    }
346
24.4k
    uint8_t storage_format = 0;
347
24.4k
    if (is_keep_original_value) {
348
0
        storage_format = 2;
349
24.4k
    } else if (is_ascending) {
350
214
        storage_format = 1;
351
214
    }
352
24.4k
    _storage_formats.push_back(storage_format);
353
24.4k
    _bit_widths.push_back(bit_width);
354
355
24.4k
    _buffered_values_num = 0;
356
24.4k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIhE27bit_packing_one_frame_valueEPKh
Unexecuted instantiation: _ZN5doris10ForEncoderItE27bit_packing_one_frame_valueEPKt
_ZN5doris10ForEncoderIjE27bit_packing_one_frame_valueEPKj
Line
Count
Source
269
6
void ForEncoder<T>::bit_packing_one_frame_value(const T* input) {
270
6
    T min = input[0];
271
6
    T max = input[0];
272
6
    bool is_ascending = true;
273
6
    uint8_t bit_width = 0;
274
6
    T half_max_delta = numeric_limits_max() >> 1;
275
6
    bool is_keep_original_value = false;
276
277
    // 1. make sure order_flag, save_original_value, and find max&min.
278
768
    for (uint8_t i = 1; i < _buffered_values_num; ++i) {
279
762
        if (is_ascending) {
280
762
            if (input[i] < input[i - 1]) {
281
0
                is_ascending = false;
282
762
            } else {
283
762
                if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow
284
0
                    is_keep_original_value = true;
285
762
                } else {
286
762
                    bit_width = std::max(bit_width, bits(input[i] - input[i - 1]));
287
762
                }
288
762
            }
289
762
        }
290
291
762
        if (input[i] < min) {
292
0
            min = input[i];
293
0
            continue;
294
0
        }
295
296
762
        if (input[i] > max) {
297
762
            max = input[i];
298
762
        }
299
762
    }
300
6
    if (!is_ascending) {
301
0
        if ((max >> 1) - (min >> 1) > half_max_delta) {
302
0
            is_keep_original_value = true;
303
0
        }
304
0
    }
305
306
    // 2. save min value.
307
6
    if (sizeof(T) == 16) {
308
0
        put_fixed128_le(_buffer, static_cast<uint128_t>(min));
309
6
    } else if (sizeof(T) == 8) {
310
0
        put_fixed64_le(_buffer, static_cast<uint64_t>(min));
311
6
    } else {
312
6
        put_fixed32_le(_buffer, static_cast<uint32_t>(min));
313
6
    }
314
315
    // 3.1 save original value.
316
6
    if (is_keep_original_value) {
317
0
        bit_width = sizeof(T) * 8;
318
0
        uint32_t len = _buffered_values_num * bit_width;
319
0
        _buffer->reserve(_buffer->size() + len);
320
0
        size_t origin_size = _buffer->size();
321
0
        _buffer->resize(origin_size + len);
322
0
        bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size);
323
6
    } else {
324
        // 3.2 bit pack.
325
        // improve for ascending order input, we could use fewer bit
326
6
        T delta_values[FRAME_VALUE_NUM];
327
6
        if (is_ascending) {
328
6
            delta_values[0] = 0;
329
768
            for (uint8_t i = 1; i < _buffered_values_num; ++i) {
330
762
                delta_values[i] = input[i] - input[i - 1];
331
762
            }
332
6
        } else {
333
0
            bit_width = bits(static_cast<T>(max - min));
334
0
            for (uint8_t i = 0; i < _buffered_values_num; ++i) {
335
0
                delta_values[i] = input[i] - min;
336
0
            }
337
0
        }
338
339
6
        uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8);
340
341
6
        _buffer->reserve(_buffer->size() + packing_len);
342
6
        size_t origin_size = _buffer->size();
343
6
        _buffer->resize(origin_size + packing_len);
344
6
        bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size);
345
6
    }
346
6
    uint8_t storage_format = 0;
347
6
    if (is_keep_original_value) {
348
0
        storage_format = 2;
349
6
    } else if (is_ascending) {
350
6
        storage_format = 1;
351
6
    }
352
6
    _storage_formats.push_back(storage_format);
353
6
    _bit_widths.push_back(bit_width);
354
355
6
    _buffered_values_num = 0;
356
6
}
Unexecuted instantiation: _ZN5doris10ForEncoderImE27bit_packing_one_frame_valueEPKm
Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE27bit_packing_one_frame_valueEPKS1_
Unexecuted instantiation: _ZN5doris10ForEncoderIoE27bit_packing_one_frame_valueEPKo
357
358
template <typename T>
359
32.6k
uint32_t ForEncoder<T>::flush() {
360
32.6k
    if (_buffered_values_num != 0) {
361
32.5k
        bit_packing_one_frame_value(_buffered_values);
362
32.5k
    }
363
364
    // write the footer:
365
    // 1 _storage_formats and bit_widths
366
32.6k
    DCHECK(_storage_formats.size() == _bit_widths.size())
367
0
            << "Size of _storage_formats and _bit_widths should be equal.";
368
81.5k
    for (size_t i = 0; i < _storage_formats.size(); i++) {
369
48.9k
        _buffer->append(&_storage_formats[i], 1);
370
48.9k
        _buffer->append(&_bit_widths[i], 1);
371
48.9k
    }
372
    // 2 frame_value_num and values_num
373
32.6k
    uint8_t frame_value_num = FRAME_VALUE_NUM;
374
32.6k
    _buffer->append(&frame_value_num, 1);
375
32.6k
    put_fixed32_le(_buffer, _values_num);
376
377
32.6k
    return cast_set<uint32_t>(_buffer->size());
378
32.6k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIaE5flushEv
Unexecuted instantiation: _ZN5doris10ForEncoderIsE5flushEv
_ZN5doris10ForEncoderIiE5flushEv
Line
Count
Source
359
7
uint32_t ForEncoder<T>::flush() {
360
7
    if (_buffered_values_num != 0) {
361
4
        bit_packing_one_frame_value(_buffered_values);
362
4
    }
363
364
    // write the footer:
365
    // 1 _storage_formats and bit_widths
366
7
    DCHECK(_storage_formats.size() == _bit_widths.size())
367
0
            << "Size of _storage_formats and _bit_widths should be equal.";
368
16
    for (size_t i = 0; i < _storage_formats.size(); i++) {
369
9
        _buffer->append(&_storage_formats[i], 1);
370
9
        _buffer->append(&_bit_widths[i], 1);
371
9
    }
372
    // 2 frame_value_num and values_num
373
7
    uint8_t frame_value_num = FRAME_VALUE_NUM;
374
7
    _buffer->append(&frame_value_num, 1);
375
7
    put_fixed32_le(_buffer, _values_num);
376
377
7
    return cast_set<uint32_t>(_buffer->size());
378
7
}
_ZN5doris10ForEncoderIlE5flushEv
Line
Count
Source
359
16.3k
uint32_t ForEncoder<T>::flush() {
360
16.3k
    if (_buffered_values_num != 0) {
361
16.2k
        bit_packing_one_frame_value(_buffered_values);
362
16.2k
    }
363
364
    // write the footer:
365
    // 1 _storage_formats and bit_widths
366
16.3k
    DCHECK(_storage_formats.size() == _bit_widths.size())
367
0
            << "Size of _storage_formats and _bit_widths should be equal.";
368
40.7k
    for (size_t i = 0; i < _storage_formats.size(); i++) {
369
24.4k
        _buffer->append(&_storage_formats[i], 1);
370
24.4k
        _buffer->append(&_bit_widths[i], 1);
371
24.4k
    }
372
    // 2 frame_value_num and values_num
373
16.3k
    uint8_t frame_value_num = FRAME_VALUE_NUM;
374
16.3k
    _buffer->append(&frame_value_num, 1);
375
16.3k
    put_fixed32_le(_buffer, _values_num);
376
377
16.3k
    return cast_set<uint32_t>(_buffer->size());
378
16.3k
}
_ZN5doris10ForEncoderInE5flushEv
Line
Count
Source
359
16.3k
uint32_t ForEncoder<T>::flush() {
360
16.3k
    if (_buffered_values_num != 0) {
361
16.2k
        bit_packing_one_frame_value(_buffered_values);
362
16.2k
    }
363
364
    // write the footer:
365
    // 1 _storage_formats and bit_widths
366
16.3k
    DCHECK(_storage_formats.size() == _bit_widths.size())
367
0
            << "Size of _storage_formats and _bit_widths should be equal.";
368
40.7k
    for (size_t i = 0; i < _storage_formats.size(); i++) {
369
24.4k
        _buffer->append(&_storage_formats[i], 1);
370
24.4k
        _buffer->append(&_bit_widths[i], 1);
371
24.4k
    }
372
    // 2 frame_value_num and values_num
373
16.3k
    uint8_t frame_value_num = FRAME_VALUE_NUM;
374
16.3k
    _buffer->append(&frame_value_num, 1);
375
16.3k
    put_fixed32_le(_buffer, _values_num);
376
377
16.3k
    return cast_set<uint32_t>(_buffer->size());
378
16.3k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIhE5flushEv
Unexecuted instantiation: _ZN5doris10ForEncoderItE5flushEv
_ZN5doris10ForEncoderIjE5flushEv
Line
Count
Source
359
3
uint32_t ForEncoder<T>::flush() {
360
3
    if (_buffered_values_num != 0) {
361
0
        bit_packing_one_frame_value(_buffered_values);
362
0
    }
363
364
    // write the footer:
365
    // 1 _storage_formats and bit_widths
366
3
    DCHECK(_storage_formats.size() == _bit_widths.size())
367
0
            << "Size of _storage_formats and _bit_widths should be equal.";
368
9
    for (size_t i = 0; i < _storage_formats.size(); i++) {
369
6
        _buffer->append(&_storage_formats[i], 1);
370
6
        _buffer->append(&_bit_widths[i], 1);
371
6
    }
372
    // 2 frame_value_num and values_num
373
3
    uint8_t frame_value_num = FRAME_VALUE_NUM;
374
3
    _buffer->append(&frame_value_num, 1);
375
3
    put_fixed32_le(_buffer, _values_num);
376
377
3
    return cast_set<uint32_t>(_buffer->size());
378
3
}
Unexecuted instantiation: _ZN5doris10ForEncoderImE5flushEv
Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE5flushEv
Unexecuted instantiation: _ZN5doris10ForEncoderIoE5flushEv
379
380
template <typename T>
381
48.9k
const T ForEncoder<T>::numeric_limits_max() {
382
48.9k
    return std::numeric_limits<T>::max();
383
48.9k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIaE18numeric_limits_maxEv
Unexecuted instantiation: _ZN5doris10ForEncoderIsE18numeric_limits_maxEv
_ZN5doris10ForEncoderIiE18numeric_limits_maxEv
Line
Count
Source
381
9
const T ForEncoder<T>::numeric_limits_max() {
382
9
    return std::numeric_limits<T>::max();
383
9
}
_ZN5doris10ForEncoderIlE18numeric_limits_maxEv
Line
Count
Source
381
24.4k
const T ForEncoder<T>::numeric_limits_max() {
382
24.4k
    return std::numeric_limits<T>::max();
383
24.4k
}
_ZN5doris10ForEncoderInE18numeric_limits_maxEv
Line
Count
Source
381
24.4k
const T ForEncoder<T>::numeric_limits_max() {
382
24.4k
    return std::numeric_limits<T>::max();
383
24.4k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIhE18numeric_limits_maxEv
Unexecuted instantiation: _ZN5doris10ForEncoderItE18numeric_limits_maxEv
_ZN5doris10ForEncoderIjE18numeric_limits_maxEv
Line
Count
Source
381
6
const T ForEncoder<T>::numeric_limits_max() {
382
6
    return std::numeric_limits<T>::max();
383
6
}
Unexecuted instantiation: _ZN5doris10ForEncoderImE18numeric_limits_maxEv
Unexecuted instantiation: _ZN5doris10ForEncoderIoE18numeric_limits_maxEv
384
385
template <>
386
0
const uint24_t ForEncoder<uint24_t>::numeric_limits_max() {
387
0
    return 0XFFFFFF;
388
0
}
389
390
template <typename T>
391
32.6k
bool ForDecoder<T>::init() {
392
    // When row count is zero, the minimum footer size is 5:
393
    // only has ValuesNum(4) + FrameValueNum(1)
394
32.6k
    if (_buffer_len < 5) {
395
0
        return false;
396
0
    }
397
398
32.6k
    _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5);
399
32.6k
    _values_num = decode_fixed32_le(_buffer + _buffer_len - 4);
400
32.6k
    _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0);
401
32.6k
    _last_frame_size =
402
32.6k
            cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num));
403
404
32.6k
    size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2;
405
406
    // read _storage_formats, bit_widths and compute frame_offsets
407
32.6k
    u_int32_t frame_start_offset = 0;
408
81.5k
    for (uint32_t i = 0; i < _frame_count; i++) {
409
48.9k
        uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset);
410
48.9k
        uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1);
411
48.9k
        _bit_widths.push_back(bit_width);
412
48.9k
        _storage_formats.push_back(order_flag);
413
414
48.9k
        bit_width_offset += 2;
415
416
48.9k
        _frame_offsets.push_back(frame_start_offset);
417
48.9k
        if (sizeof(T) == 16) {
418
24.4k
            frame_start_offset += bit_width * _max_frame_size / 8 + 16;
419
24.4k
        } else if (sizeof(T) == 8) {
420
24.4k
            frame_start_offset += bit_width * _max_frame_size / 8 + 8;
421
24.4k
        } else {
422
15
            frame_start_offset += bit_width * _max_frame_size / 8 + 4;
423
15
        }
424
48.9k
    }
425
426
32.6k
    _out_buffer.resize(_max_frame_size);
427
32.6k
    _parsed = true;
428
429
32.6k
    return true;
430
32.6k
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE4initEv
Unexecuted instantiation: _ZN5doris10ForDecoderIsE4initEv
_ZN5doris10ForDecoderIiE4initEv
Line
Count
Source
391
7
bool ForDecoder<T>::init() {
392
    // When row count is zero, the minimum footer size is 5:
393
    // only has ValuesNum(4) + FrameValueNum(1)
394
7
    if (_buffer_len < 5) {
395
0
        return false;
396
0
    }
397
398
7
    _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5);
399
7
    _values_num = decode_fixed32_le(_buffer + _buffer_len - 4);
400
7
    _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0);
401
7
    _last_frame_size =
402
7
            cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num));
403
404
7
    size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2;
405
406
    // read _storage_formats, bit_widths and compute frame_offsets
407
7
    u_int32_t frame_start_offset = 0;
408
16
    for (uint32_t i = 0; i < _frame_count; i++) {
409
9
        uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset);
410
9
        uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1);
411
9
        _bit_widths.push_back(bit_width);
412
9
        _storage_formats.push_back(order_flag);
413
414
9
        bit_width_offset += 2;
415
416
9
        _frame_offsets.push_back(frame_start_offset);
417
9
        if (sizeof(T) == 16) {
418
0
            frame_start_offset += bit_width * _max_frame_size / 8 + 16;
419
9
        } else if (sizeof(T) == 8) {
420
0
            frame_start_offset += bit_width * _max_frame_size / 8 + 8;
421
9
        } else {
422
9
            frame_start_offset += bit_width * _max_frame_size / 8 + 4;
423
9
        }
424
9
    }
425
426
7
    _out_buffer.resize(_max_frame_size);
427
7
    _parsed = true;
428
429
7
    return true;
430
7
}
_ZN5doris10ForDecoderIlE4initEv
Line
Count
Source
391
16.3k
bool ForDecoder<T>::init() {
392
    // When row count is zero, the minimum footer size is 5:
393
    // only has ValuesNum(4) + FrameValueNum(1)
394
16.3k
    if (_buffer_len < 5) {
395
0
        return false;
396
0
    }
397
398
16.3k
    _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5);
399
16.3k
    _values_num = decode_fixed32_le(_buffer + _buffer_len - 4);
400
16.3k
    _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0);
401
16.3k
    _last_frame_size =
402
16.3k
            cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num));
403
404
16.3k
    size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2;
405
406
    // read _storage_formats, bit_widths and compute frame_offsets
407
16.3k
    u_int32_t frame_start_offset = 0;
408
40.7k
    for (uint32_t i = 0; i < _frame_count; i++) {
409
24.4k
        uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset);
410
24.4k
        uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1);
411
24.4k
        _bit_widths.push_back(bit_width);
412
24.4k
        _storage_formats.push_back(order_flag);
413
414
24.4k
        bit_width_offset += 2;
415
416
24.4k
        _frame_offsets.push_back(frame_start_offset);
417
24.4k
        if (sizeof(T) == 16) {
418
0
            frame_start_offset += bit_width * _max_frame_size / 8 + 16;
419
24.4k
        } else if (sizeof(T) == 8) {
420
24.4k
            frame_start_offset += bit_width * _max_frame_size / 8 + 8;
421
24.4k
        } else {
422
0
            frame_start_offset += bit_width * _max_frame_size / 8 + 4;
423
0
        }
424
24.4k
    }
425
426
16.3k
    _out_buffer.resize(_max_frame_size);
427
16.3k
    _parsed = true;
428
429
16.3k
    return true;
430
16.3k
}
_ZN5doris10ForDecoderInE4initEv
Line
Count
Source
391
16.3k
bool ForDecoder<T>::init() {
392
    // When row count is zero, the minimum footer size is 5:
393
    // only has ValuesNum(4) + FrameValueNum(1)
394
16.3k
    if (_buffer_len < 5) {
395
0
        return false;
396
0
    }
397
398
16.3k
    _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5);
399
16.3k
    _values_num = decode_fixed32_le(_buffer + _buffer_len - 4);
400
16.3k
    _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0);
401
16.3k
    _last_frame_size =
402
16.3k
            cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num));
403
404
16.3k
    size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2;
405
406
    // read _storage_formats, bit_widths and compute frame_offsets
407
16.3k
    u_int32_t frame_start_offset = 0;
408
40.7k
    for (uint32_t i = 0; i < _frame_count; i++) {
409
24.4k
        uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset);
410
24.4k
        uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1);
411
24.4k
        _bit_widths.push_back(bit_width);
412
24.4k
        _storage_formats.push_back(order_flag);
413
414
24.4k
        bit_width_offset += 2;
415
416
24.4k
        _frame_offsets.push_back(frame_start_offset);
417
24.4k
        if (sizeof(T) == 16) {
418
24.4k
            frame_start_offset += bit_width * _max_frame_size / 8 + 16;
419
24.4k
        } else if (sizeof(T) == 8) {
420
0
            frame_start_offset += bit_width * _max_frame_size / 8 + 8;
421
0
        } else {
422
0
            frame_start_offset += bit_width * _max_frame_size / 8 + 4;
423
0
        }
424
24.4k
    }
425
426
16.3k
    _out_buffer.resize(_max_frame_size);
427
16.3k
    _parsed = true;
428
429
16.3k
    return true;
430
16.3k
}
Unexecuted instantiation: _ZN5doris10ForDecoderIhE4initEv
Unexecuted instantiation: _ZN5doris10ForDecoderItE4initEv
_ZN5doris10ForDecoderIjE4initEv
Line
Count
Source
391
3
bool ForDecoder<T>::init() {
392
    // When row count is zero, the minimum footer size is 5:
393
    // only has ValuesNum(4) + FrameValueNum(1)
394
3
    if (_buffer_len < 5) {
395
0
        return false;
396
0
    }
397
398
3
    _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5);
399
3
    _values_num = decode_fixed32_le(_buffer + _buffer_len - 4);
400
3
    _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0);
401
3
    _last_frame_size =
402
3
            cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num));
403
404
3
    size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2;
405
406
    // read _storage_formats, bit_widths and compute frame_offsets
407
3
    u_int32_t frame_start_offset = 0;
408
9
    for (uint32_t i = 0; i < _frame_count; i++) {
409
6
        uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset);
410
6
        uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1);
411
6
        _bit_widths.push_back(bit_width);
412
6
        _storage_formats.push_back(order_flag);
413
414
6
        bit_width_offset += 2;
415
416
6
        _frame_offsets.push_back(frame_start_offset);
417
6
        if (sizeof(T) == 16) {
418
0
            frame_start_offset += bit_width * _max_frame_size / 8 + 16;
419
6
        } else if (sizeof(T) == 8) {
420
0
            frame_start_offset += bit_width * _max_frame_size / 8 + 8;
421
6
        } else {
422
6
            frame_start_offset += bit_width * _max_frame_size / 8 + 4;
423
6
        }
424
6
    }
425
426
3
    _out_buffer.resize(_max_frame_size);
427
3
    _parsed = true;
428
429
3
    return true;
430
3
}
Unexecuted instantiation: _ZN5doris10ForDecoderImE4initEv
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE4initEv
Unexecuted instantiation: _ZN5doris10ForDecoderIoE4initEv
431
432
// todo(kks): improve this method by SIMD instructions
433
434
template <typename T>
435
template <typename U>
436
void ForDecoder<T>::bit_unpack_optimize(const uint8_t* input, uint8_t in_num, int bit_width,
437
81.3k
                                        T* output) {
438
81.3k
    static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value,
439
81.3k
                  "bit_unpack_optimize only supports U = int64_t or __int128_t");
440
81.3k
    constexpr int u_size = sizeof(U);                   // Size of U
441
81.3k
    constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size)
442
81.3k
    int valid_bit = 0;                                  // How many valid bits
443
81.3k
    int need_bit = 0;                                   // still need
444
81.3k
    size_t input_size = (in_num * bit_width + 7) >> 3;  // input's size
445
81.3k
    int full_batch_size =
446
81.3k
            cast_set<int>((input_size >> u_size_shift)
447
81.3k
                          << u_size_shift);     // Adjust input_size to a multiple of u_size
448
81.3k
    int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size.
449
    // The number of bits in input to adjust to multiples of 8 and thus more
450
81.3k
    int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width));
451
452
    // to ensure that only bit_width bits are valid
453
81.3k
    T output_mask;
454
81.3k
    if (bit_width >= static_cast<int>(sizeof(T) * 8)) {
455
0
        output_mask = static_cast<T>(~T(0));
456
81.3k
    } else {
457
81.3k
        output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1);
458
81.3k
    }
459
460
81.3k
    U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking
461
462
4.48M
    for (int i = 0; i < full_batch_size; i += u_size) {
463
4.40M
        s = 0;
464
465
4.40M
        s = to_endian<std::endian::big>(*((U*)(input + i)));
466
467
        // Determine what the valid bits are based on u_size
468
4.40M
        valid_bit = u_size << 3;
469
470
        // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop.
471
4.40M
        if (tail_count == 0 && i == full_batch_size - u_size) {
472
21.7k
            valid_bit -= more_bit;
473
21.7k
            s >>= more_bit;
474
21.7k
        }
475
476
4.40M
        if (need_bit) {
477
            // The last time we take away the high bit_width - need_bit,
478
            // we need to make up the rest of the need_bit from the width.
479
            // Use valid_bit - need_bit to compute high need_bit bits of s
480
            // perform an AND operation to ensure that only need_bit bits are valid
481
4.09M
            auto mask = (static_cast<U>(1) << need_bit) - 1;
482
4.09M
            auto shifted = s >> (valid_bit - need_bit);
483
4.09M
            auto masked_result = shifted & mask;
484
4.09M
            if constexpr (sizeof(T) <= 4) {
485
0
                *output |= static_cast<T>(static_cast<uint32_t>(masked_result));
486
4.09M
            } else {
487
4.09M
                *output |= static_cast<T>(masked_result);
488
4.09M
            }
489
4.09M
            output++;
490
4.09M
            valid_bit -= need_bit;
491
4.09M
        }
492
493
4.40M
        int num = valid_bit / bit_width;             // How many outputs can be processed at a time
494
4.40M
        int remainder = valid_bit - num * bit_width; // How many bits are left to store
495
496
        // Starting with the highest valid bit, take out bit_width bits in sequence
497
        // perform an AND operation with output_mask to ensure that only bit_width bits are valid
498
        // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end
499
        // But since there are still remainder bits that can't be processed, need to add the remainder
500
8.51M
        for (int j = 0; j < num; j++) {
501
4.11M
            *output =
502
4.11M
                    static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask);
503
4.11M
            output++;
504
4.11M
        }
505
506
4.40M
        if (remainder) {
507
            // Process the last remaining remainder bit.
508
            // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits.
509
            // output = y << (bit_width - remainder) Use the high bit_width - remainder bit
510
4.14M
            if constexpr (sizeof(T) <= 4) {
511
0
                auto masked_value = static_cast<T>(
512
0
                        static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1)));
513
0
                *output = static_cast<T>(masked_value << (bit_width - remainder));
514
4.14M
            } else {
515
4.14M
                auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1)));
516
4.14M
                *output = static_cast<T>(masked_value << (bit_width - remainder));
517
4.14M
            }
518
            // Already have remainder bits, next time need bit_width - remainder bits
519
4.14M
            need_bit = bit_width - remainder;
520
4.14M
        } else {
521
257k
            need_bit = 0;
522
257k
        }
523
4.40M
    }
524
525
    // remainder
526
81.3k
    if (tail_count) {
527
        // Put the tail_count numbers in the input into s in order, each number occupies 8 bit
528
477k
        for (int i = 0; i < tail_count; i++) {
529
417k
            s <<= 8;
530
417k
            s |= input[full_batch_size + i];
531
417k
        }
532
533
        // tail * 8 is the number of bits that are left to process
534
        // tail * 8 - more_bit is to remove the last more_bit
535
59.2k
        valid_bit = (tail_count << 3) - more_bit;
536
59.2k
        s >>= more_bit;
537
538
        // same as before
539
59.2k
        if (need_bit) {
540
54.0k
            if constexpr (sizeof(T) <= 4) {
541
0
                *output |= static_cast<T>(static_cast<uint32_t>(
542
0
                        (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1)));
543
54.0k
            } else {
544
54.0k
                *output |= static_cast<T>((s >> (valid_bit - need_bit)) &
545
54.0k
                                          ((static_cast<U>(1) << need_bit) - 1));
546
54.0k
            }
547
54.0k
            output++;
548
54.0k
            valid_bit -= need_bit;
549
54.0k
        }
550
551
59.2k
        int num = valid_bit / bit_width; // How many outputs can be processed at a time
552
553
        // same as before
554
126k
        for (int j = 0; j < num; j++) {
555
67.2k
            *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask);
556
67.2k
            output++;
557
67.2k
        }
558
59.2k
    }
559
81.3k
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE19bit_unpack_optimizeIlEEvPKhhiPa
Unexecuted instantiation: _ZN5doris10ForDecoderIaE19bit_unpack_optimizeInEEvPKhhiPa
Unexecuted instantiation: _ZN5doris10ForDecoderIsE19bit_unpack_optimizeIlEEvPKhhiPs
Unexecuted instantiation: _ZN5doris10ForDecoderIsE19bit_unpack_optimizeInEEvPKhhiPs
_ZN5doris10ForDecoderIiE19bit_unpack_optimizeIlEEvPKhhiPi
Line
Count
Source
437
9
                                        T* output) {
438
9
    static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value,
439
9
                  "bit_unpack_optimize only supports U = int64_t or __int128_t");
440
9
    constexpr int u_size = sizeof(U);                   // Size of U
441
9
    constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size)
442
9
    int valid_bit = 0;                                  // How many valid bits
443
9
    int need_bit = 0;                                   // still need
444
9
    size_t input_size = (in_num * bit_width + 7) >> 3;  // input's size
445
9
    int full_batch_size =
446
9
            cast_set<int>((input_size >> u_size_shift)
447
9
                          << u_size_shift);     // Adjust input_size to a multiple of u_size
448
9
    int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size.
449
    // The number of bits in input to adjust to multiples of 8 and thus more
450
9
    int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width));
451
452
    // to ensure that only bit_width bits are valid
453
9
    T output_mask;
454
9
    if (bit_width >= static_cast<int>(sizeof(T) * 8)) {
455
0
        output_mask = static_cast<T>(~T(0));
456
9
    } else {
457
9
        output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1);
458
9
    }
459
460
9
    U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking
461
462
21
    for (int i = 0; i < full_batch_size; i += u_size) {
463
12
        s = 0;
464
465
12
        s = to_endian<std::endian::big>(*((U*)(input + i)));
466
467
        // Determine what the valid bits are based on u_size
468
12
        valid_bit = u_size << 3;
469
470
        // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop.
471
12
        if (tail_count == 0 && i == full_batch_size - u_size) {
472
7
            valid_bit -= more_bit;
473
7
            s >>= more_bit;
474
7
        }
475
476
12
        if (need_bit) {
477
            // The last time we take away the high bit_width - need_bit,
478
            // we need to make up the rest of the need_bit from the width.
479
            // Use valid_bit - need_bit to compute high need_bit bits of s
480
            // perform an AND operation to ensure that only need_bit bits are valid
481
0
            auto mask = (static_cast<U>(1) << need_bit) - 1;
482
0
            auto shifted = s >> (valid_bit - need_bit);
483
0
            auto masked_result = shifted & mask;
484
0
            if constexpr (sizeof(T) <= 4) {
485
0
                *output |= static_cast<T>(static_cast<uint32_t>(masked_result));
486
            } else {
487
                *output |= static_cast<T>(masked_result);
488
            }
489
0
            output++;
490
0
            valid_bit -= need_bit;
491
0
        }
492
493
12
        int num = valid_bit / bit_width;             // How many outputs can be processed at a time
494
12
        int remainder = valid_bit - num * bit_width; // How many bits are left to store
495
496
        // Starting with the highest valid bit, take out bit_width bits in sequence
497
        // perform an AND operation with output_mask to ensure that only bit_width bits are valid
498
        // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end
499
        // But since there are still remainder bits that can't be processed, need to add the remainder
500
780
        for (int j = 0; j < num; j++) {
501
768
            *output =
502
768
                    static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask);
503
768
            output++;
504
768
        }
505
506
12
        if (remainder) {
507
            // Process the last remaining remainder bit.
508
            // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits.
509
            // output = y << (bit_width - remainder) Use the high bit_width - remainder bit
510
0
            if constexpr (sizeof(T) <= 4) {
511
0
                auto masked_value = static_cast<T>(
512
0
                        static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1)));
513
0
                *output = static_cast<T>(masked_value << (bit_width - remainder));
514
            } else {
515
                auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1)));
516
                *output = static_cast<T>(masked_value << (bit_width - remainder));
517
            }
518
            // Already have remainder bits, next time need bit_width - remainder bits
519
0
            need_bit = bit_width - remainder;
520
12
        } else {
521
12
            need_bit = 0;
522
12
        }
523
12
    }
524
525
    // remainder
526
9
    if (tail_count) {
527
        // Put the tail_count numbers in the input into s in order, each number occupies 8 bit
528
2
        for (int i = 0; i < tail_count; i++) {
529
1
            s <<= 8;
530
1
            s |= input[full_batch_size + i];
531
1
        }
532
533
        // tail * 8 is the number of bits that are left to process
534
        // tail * 8 - more_bit is to remove the last more_bit
535
1
        valid_bit = (tail_count << 3) - more_bit;
536
1
        s >>= more_bit;
537
538
        // same as before
539
1
        if (need_bit) {
540
0
            if constexpr (sizeof(T) <= 4) {
541
0
                *output |= static_cast<T>(static_cast<uint32_t>(
542
0
                        (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1)));
543
            } else {
544
                *output |= static_cast<T>((s >> (valid_bit - need_bit)) &
545
                                          ((static_cast<U>(1) << need_bit) - 1));
546
            }
547
0
            output++;
548
0
            valid_bit -= need_bit;
549
0
        }
550
551
1
        int num = valid_bit / bit_width; // How many outputs can be processed at a time
552
553
        // same as before
554
3
        for (int j = 0; j < num; j++) {
555
2
            *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask);
556
2
            output++;
557
2
        }
558
1
    }
559
9
}
Unexecuted instantiation: _ZN5doris10ForDecoderIiE19bit_unpack_optimizeInEEvPKhhiPi
_ZN5doris10ForDecoderIlE19bit_unpack_optimizeIlEEvPKhhiPl
Line
Count
Source
437
12.3k
                                        T* output) {
438
12.3k
    static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value,
439
12.3k
                  "bit_unpack_optimize only supports U = int64_t or __int128_t");
440
12.3k
    constexpr int u_size = sizeof(U);                   // Size of U
441
12.3k
    constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size)
442
12.3k
    int valid_bit = 0;                                  // How many valid bits
443
12.3k
    int need_bit = 0;                                   // still need
444
12.3k
    size_t input_size = (in_num * bit_width + 7) >> 3;  // input's size
445
12.3k
    int full_batch_size =
446
12.3k
            cast_set<int>((input_size >> u_size_shift)
447
12.3k
                          << u_size_shift);     // Adjust input_size to a multiple of u_size
448
12.3k
    int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size.
449
    // The number of bits in input to adjust to multiples of 8 and thus more
450
12.3k
    int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width));
451
452
    // to ensure that only bit_width bits are valid
453
12.3k
    T output_mask;
454
12.3k
    if (bit_width >= static_cast<int>(sizeof(T) * 8)) {
455
0
        output_mask = static_cast<T>(~T(0));
456
12.3k
    } else {
457
12.3k
        output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1);
458
12.3k
    }
459
460
12.3k
    U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking
461
462
278k
    for (int i = 0; i < full_batch_size; i += u_size) {
463
266k
        s = 0;
464
465
266k
        s = to_endian<std::endian::big>(*((U*)(input + i)));
466
467
        // Determine what the valid bits are based on u_size
468
266k
        valid_bit = u_size << 3;
469
470
        // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop.
471
266k
        if (tail_count == 0 && i == full_batch_size - u_size) {
472
5.19k
            valid_bit -= more_bit;
473
5.19k
            s >>= more_bit;
474
5.19k
        }
475
476
266k
        if (need_bit) {
477
            // The last time we take away the high bit_width - need_bit,
478
            // we need to make up the rest of the need_bit from the width.
479
            // Use valid_bit - need_bit to compute high need_bit bits of s
480
            // perform an AND operation to ensure that only need_bit bits are valid
481
207k
            auto mask = (static_cast<U>(1) << need_bit) - 1;
482
207k
            auto shifted = s >> (valid_bit - need_bit);
483
207k
            auto masked_result = shifted & mask;
484
            if constexpr (sizeof(T) <= 4) {
485
                *output |= static_cast<T>(static_cast<uint32_t>(masked_result));
486
207k
            } else {
487
207k
                *output |= static_cast<T>(masked_result);
488
207k
            }
489
207k
            output++;
490
207k
            valid_bit -= need_bit;
491
207k
        }
492
493
266k
        int num = valid_bit / bit_width;             // How many outputs can be processed at a time
494
266k
        int remainder = valid_bit - num * bit_width; // How many bits are left to store
495
496
        // Starting with the highest valid bit, take out bit_width bits in sequence
497
        // perform an AND operation with output_mask to ensure that only bit_width bits are valid
498
        // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end
499
        // But since there are still remainder bits that can't be processed, need to add the remainder
500
1.07M
        for (int j = 0; j < num; j++) {
501
809k
            *output =
502
809k
                    static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask);
503
809k
            output++;
504
809k
        }
505
506
266k
        if (remainder) {
507
            // Process the last remaining remainder bit.
508
            // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits.
509
            // output = y << (bit_width - remainder) Use the high bit_width - remainder bit
510
            if constexpr (sizeof(T) <= 4) {
511
                auto masked_value = static_cast<T>(
512
                        static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1)));
513
                *output = static_cast<T>(masked_value << (bit_width - remainder));
514
212k
            } else {
515
212k
                auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1)));
516
212k
                *output = static_cast<T>(masked_value << (bit_width - remainder));
517
212k
            }
518
            // Already have remainder bits, next time need bit_width - remainder bits
519
212k
            need_bit = bit_width - remainder;
520
212k
        } else {
521
53.9k
            need_bit = 0;
522
53.9k
        }
523
266k
    }
524
525
    // remainder
526
12.3k
    if (tail_count) {
527
        // Put the tail_count numbers in the input into s in order, each number occupies 8 bit
528
35.0k
        for (int i = 0; i < tail_count; i++) {
529
28.0k
            s <<= 8;
530
28.0k
            s |= input[full_batch_size + i];
531
28.0k
        }
532
533
        // tail * 8 is the number of bits that are left to process
534
        // tail * 8 - more_bit is to remove the last more_bit
535
6.98k
        valid_bit = (tail_count << 3) - more_bit;
536
6.98k
        s >>= more_bit;
537
538
        // same as before
539
6.98k
        if (need_bit) {
540
            if constexpr (sizeof(T) <= 4) {
541
                *output |= static_cast<T>(static_cast<uint32_t>(
542
                        (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1)));
543
5.14k
            } else {
544
5.14k
                *output |= static_cast<T>((s >> (valid_bit - need_bit)) &
545
5.14k
                                          ((static_cast<U>(1) << need_bit) - 1));
546
5.14k
            }
547
5.14k
            output++;
548
5.14k
            valid_bit -= need_bit;
549
5.14k
        }
550
551
6.98k
        int num = valid_bit / bit_width; // How many outputs can be processed at a time
552
553
        // same as before
554
30.6k
        for (int j = 0; j < num; j++) {
555
23.6k
            *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask);
556
23.6k
            output++;
557
23.6k
        }
558
6.98k
    }
559
12.3k
}
_ZN5doris10ForDecoderIlE19bit_unpack_optimizeInEEvPKhhiPl
Line
Count
Source
437
12.1k
                                        T* output) {
438
12.1k
    static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value,
439
12.1k
                  "bit_unpack_optimize only supports U = int64_t or __int128_t");
440
12.1k
    constexpr int u_size = sizeof(U);                   // Size of U
441
12.1k
    constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size)
442
12.1k
    int valid_bit = 0;                                  // How many valid bits
443
12.1k
    int need_bit = 0;                                   // still need
444
12.1k
    size_t input_size = (in_num * bit_width + 7) >> 3;  // input's size
445
12.1k
    int full_batch_size =
446
12.1k
            cast_set<int>((input_size >> u_size_shift)
447
12.1k
                          << u_size_shift);     // Adjust input_size to a multiple of u_size
448
12.1k
    int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size.
449
    // The number of bits in input to adjust to multiples of 8 and thus more
450
12.1k
    int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width));
451
452
    // to ensure that only bit_width bits are valid
453
12.1k
    T output_mask;
454
12.1k
    if (bit_width >= static_cast<int>(sizeof(T) * 8)) {
455
0
        output_mask = static_cast<T>(~T(0));
456
12.1k
    } else {
457
12.1k
        output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1);
458
12.1k
    }
459
460
12.1k
    U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking
461
462
403k
    for (int i = 0; i < full_batch_size; i += u_size) {
463
391k
        s = 0;
464
465
391k
        s = to_endian<std::endian::big>(*((U*)(input + i)));
466
467
        // Determine what the valid bits are based on u_size
468
391k
        valid_bit = u_size << 3;
469
470
        // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop.
471
391k
        if (tail_count == 0 && i == full_batch_size - u_size) {
472
4.55k
            valid_bit -= more_bit;
473
4.55k
            s >>= more_bit;
474
4.55k
        }
475
476
391k
        if (need_bit) {
477
            // The last time we take away the high bit_width - need_bit,
478
            // we need to make up the rest of the need_bit from the width.
479
            // Use valid_bit - need_bit to compute high need_bit bits of s
480
            // perform an AND operation to ensure that only need_bit bits are valid
481
367k
            auto mask = (static_cast<U>(1) << need_bit) - 1;
482
367k
            auto shifted = s >> (valid_bit - need_bit);
483
367k
            auto masked_result = shifted & mask;
484
            if constexpr (sizeof(T) <= 4) {
485
                *output |= static_cast<T>(static_cast<uint32_t>(masked_result));
486
367k
            } else {
487
367k
                *output |= static_cast<T>(masked_result);
488
367k
            }
489
367k
            output++;
490
367k
            valid_bit -= need_bit;
491
367k
        }
492
493
391k
        int num = valid_bit / bit_width;             // How many outputs can be processed at a time
494
391k
        int remainder = valid_bit - num * bit_width; // How many bits are left to store
495
496
        // Starting with the highest valid bit, take out bit_width bits in sequence
497
        // perform an AND operation with output_mask to ensure that only bit_width bits are valid
498
        // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end
499
        // But since there are still remainder bits that can't be processed, need to add the remainder
500
1.05M
        for (int j = 0; j < num; j++) {
501
663k
            *output =
502
663k
                    static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask);
503
663k
            output++;
504
663k
        }
505
506
391k
        if (remainder) {
507
            // Process the last remaining remainder bit.
508
            // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits.
509
            // output = y << (bit_width - remainder) Use the high bit_width - remainder bit
510
            if constexpr (sizeof(T) <= 4) {
511
                auto masked_value = static_cast<T>(
512
                        static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1)));
513
                *output = static_cast<T>(masked_value << (bit_width - remainder));
514
374k
            } else {
515
374k
                auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1)));
516
374k
                *output = static_cast<T>(masked_value << (bit_width - remainder));
517
374k
            }
518
            // Already have remainder bits, next time need bit_width - remainder bits
519
374k
            need_bit = bit_width - remainder;
520
374k
        } else {
521
16.9k
            need_bit = 0;
522
16.9k
        }
523
391k
    }
524
525
    // remainder
526
12.1k
    if (tail_count) {
527
        // Put the tail_count numbers in the input into s in order, each number occupies 8 bit
528
68.5k
        for (int i = 0; i < tail_count; i++) {
529
60.9k
            s <<= 8;
530
60.9k
            s |= input[full_batch_size + i];
531
60.9k
        }
532
533
        // tail * 8 is the number of bits that are left to process
534
        // tail * 8 - more_bit is to remove the last more_bit
535
7.60k
        valid_bit = (tail_count << 3) - more_bit;
536
7.60k
        s >>= more_bit;
537
538
        // same as before
539
7.60k
        if (need_bit) {
540
            if constexpr (sizeof(T) <= 4) {
541
                *output |= static_cast<T>(static_cast<uint32_t>(
542
                        (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1)));
543
7.31k
            } else {
544
7.31k
                *output |= static_cast<T>((s >> (valid_bit - need_bit)) &
545
7.31k
                                          ((static_cast<U>(1) << need_bit) - 1));
546
7.31k
            }
547
7.31k
            output++;
548
7.31k
            valid_bit -= need_bit;
549
7.31k
        }
550
551
7.60k
        int num = valid_bit / bit_width; // How many outputs can be processed at a time
552
553
        // same as before
554
14.1k
        for (int j = 0; j < num; j++) {
555
6.51k
            *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask);
556
6.51k
            output++;
557
6.51k
        }
558
7.60k
    }
559
12.1k
}
_ZN5doris10ForDecoderInE19bit_unpack_optimizeIlEEvPKhhiPn
Line
Count
Source
437
8.28k
                                        T* output) {
438
8.28k
    static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value,
439
8.28k
                  "bit_unpack_optimize only supports U = int64_t or __int128_t");
440
8.28k
    constexpr int u_size = sizeof(U);                   // Size of U
441
8.28k
    constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size)
442
8.28k
    int valid_bit = 0;                                  // How many valid bits
443
8.28k
    int need_bit = 0;                                   // still need
444
8.28k
    size_t input_size = (in_num * bit_width + 7) >> 3;  // input's size
445
8.28k
    int full_batch_size =
446
8.28k
            cast_set<int>((input_size >> u_size_shift)
447
8.28k
                          << u_size_shift);     // Adjust input_size to a multiple of u_size
448
8.28k
    int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size.
449
    // The number of bits in input to adjust to multiples of 8 and thus more
450
8.28k
    int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width));
451
452
    // to ensure that only bit_width bits are valid
453
8.28k
    T output_mask;
454
8.28k
    if (bit_width >= static_cast<int>(sizeof(T) * 8)) {
455
0
        output_mask = static_cast<T>(~T(0));
456
8.28k
    } else {
457
8.28k
        output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1);
458
8.28k
    }
459
460
8.28k
    U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking
461
462
274k
    for (int i = 0; i < full_batch_size; i += u_size) {
463
266k
        s = 0;
464
465
266k
        s = to_endian<std::endian::big>(*((U*)(input + i)));
466
467
        // Determine what the valid bits are based on u_size
468
266k
        valid_bit = u_size << 3;
469
470
        // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop.
471
266k
        if (tail_count == 0 && i == full_batch_size - u_size) {
472
1.12k
            valid_bit -= more_bit;
473
1.12k
            s >>= more_bit;
474
1.12k
        }
475
476
266k
        if (need_bit) {
477
            // The last time we take away the high bit_width - need_bit,
478
            // we need to make up the rest of the need_bit from the width.
479
            // Use valid_bit - need_bit to compute high need_bit bits of s
480
            // perform an AND operation to ensure that only need_bit bits are valid
481
207k
            auto mask = (static_cast<U>(1) << need_bit) - 1;
482
207k
            auto shifted = s >> (valid_bit - need_bit);
483
207k
            auto masked_result = shifted & mask;
484
            if constexpr (sizeof(T) <= 4) {
485
                *output |= static_cast<T>(static_cast<uint32_t>(masked_result));
486
207k
            } else {
487
207k
                *output |= static_cast<T>(masked_result);
488
207k
            }
489
207k
            output++;
490
207k
            valid_bit -= need_bit;
491
207k
        }
492
493
266k
        int num = valid_bit / bit_width;             // How many outputs can be processed at a time
494
266k
        int remainder = valid_bit - num * bit_width; // How many bits are left to store
495
496
        // Starting with the highest valid bit, take out bit_width bits in sequence
497
        // perform an AND operation with output_mask to ensure that only bit_width bits are valid
498
        // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end
499
        // But since there are still remainder bits that can't be processed, need to add the remainder
500
1.07M
        for (int j = 0; j < num; j++) {
501
808k
            *output =
502
808k
                    static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask);
503
808k
            output++;
504
808k
        }
505
506
266k
        if (remainder) {
507
            // Process the last remaining remainder bit.
508
            // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits.
509
            // output = y << (bit_width - remainder) Use the high bit_width - remainder bit
510
            if constexpr (sizeof(T) <= 4) {
511
                auto masked_value = static_cast<T>(
512
                        static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1)));
513
                *output = static_cast<T>(masked_value << (bit_width - remainder));
514
212k
            } else {
515
212k
                auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1)));
516
212k
                *output = static_cast<T>(masked_value << (bit_width - remainder));
517
212k
            }
518
            // Already have remainder bits, next time need bit_width - remainder bits
519
212k
            need_bit = bit_width - remainder;
520
212k
        } else {
521
53.9k
            need_bit = 0;
522
53.9k
        }
523
266k
    }
524
525
    // remainder
526
8.28k
    if (tail_count) {
527
        // Put the tail_count numbers in the input into s in order, each number occupies 8 bit
528
35.2k
        for (int i = 0; i < tail_count; i++) {
529
28.1k
            s <<= 8;
530
28.1k
            s |= input[full_batch_size + i];
531
28.1k
        }
532
533
        // tail * 8 is the number of bits that are left to process
534
        // tail * 8 - more_bit is to remove the last more_bit
535
7.04k
        valid_bit = (tail_count << 3) - more_bit;
536
7.04k
        s >>= more_bit;
537
538
        // same as before
539
7.04k
        if (need_bit) {
540
            if constexpr (sizeof(T) <= 4) {
541
                *output |= static_cast<T>(static_cast<uint32_t>(
542
                        (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1)));
543
5.14k
            } else {
544
5.14k
                *output |= static_cast<T>((s >> (valid_bit - need_bit)) &
545
5.14k
                                          ((static_cast<U>(1) << need_bit) - 1));
546
5.14k
            }
547
5.14k
            output++;
548
5.14k
            valid_bit -= need_bit;
549
5.14k
        }
550
551
7.04k
        int num = valid_bit / bit_width; // How many outputs can be processed at a time
552
553
        // same as before
554
30.7k
        for (int j = 0; j < num; j++) {
555
23.7k
            *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask);
556
23.7k
            output++;
557
23.7k
        }
558
7.04k
    }
559
8.28k
}
_ZN5doris10ForDecoderInE19bit_unpack_optimizeInEEvPKhhiPn
Line
Count
Source
437
48.5k
                                        T* output) {
438
48.5k
    static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value,
439
48.5k
                  "bit_unpack_optimize only supports U = int64_t or __int128_t");
440
48.5k
    constexpr int u_size = sizeof(U);                   // Size of U
441
48.5k
    constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size)
442
48.5k
    int valid_bit = 0;                                  // How many valid bits
443
48.5k
    int need_bit = 0;                                   // still need
444
48.5k
    size_t input_size = (in_num * bit_width + 7) >> 3;  // input's size
445
48.5k
    int full_batch_size =
446
48.5k
            cast_set<int>((input_size >> u_size_shift)
447
48.5k
                          << u_size_shift);     // Adjust input_size to a multiple of u_size
448
48.5k
    int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size.
449
    // The number of bits in input to adjust to multiples of 8 and thus more
450
48.5k
    int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width));
451
452
    // to ensure that only bit_width bits are valid
453
48.5k
    T output_mask;
454
48.5k
    if (bit_width >= static_cast<int>(sizeof(T) * 8)) {
455
0
        output_mask = static_cast<T>(~T(0));
456
48.5k
    } else {
457
48.5k
        output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1);
458
48.5k
    }
459
460
48.5k
    U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking
461
462
3.52M
    for (int i = 0; i < full_batch_size; i += u_size) {
463
3.47M
        s = 0;
464
465
3.47M
        s = to_endian<std::endian::big>(*((U*)(input + i)));
466
467
        // Determine what the valid bits are based on u_size
468
3.47M
        valid_bit = u_size << 3;
469
470
        // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop.
471
3.47M
        if (tail_count == 0 && i == full_batch_size - u_size) {
472
10.8k
            valid_bit -= more_bit;
473
10.8k
            s >>= more_bit;
474
10.8k
        }
475
476
3.47M
        if (need_bit) {
477
            // The last time we take away the high bit_width - need_bit,
478
            // we need to make up the rest of the need_bit from the width.
479
            // Use valid_bit - need_bit to compute high need_bit bits of s
480
            // perform an AND operation to ensure that only need_bit bits are valid
481
3.30M
            auto mask = (static_cast<U>(1) << need_bit) - 1;
482
3.30M
            auto shifted = s >> (valid_bit - need_bit);
483
3.30M
            auto masked_result = shifted & mask;
484
            if constexpr (sizeof(T) <= 4) {
485
                *output |= static_cast<T>(static_cast<uint32_t>(masked_result));
486
3.30M
            } else {
487
3.30M
                *output |= static_cast<T>(masked_result);
488
3.30M
            }
489
3.30M
            output++;
490
3.30M
            valid_bit -= need_bit;
491
3.30M
        }
492
493
3.47M
        int num = valid_bit / bit_width;             // How many outputs can be processed at a time
494
3.47M
        int remainder = valid_bit - num * bit_width; // How many bits are left to store
495
496
        // Starting with the highest valid bit, take out bit_width bits in sequence
497
        // perform an AND operation with output_mask to ensure that only bit_width bits are valid
498
        // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end
499
        // But since there are still remainder bits that can't be processed, need to add the remainder
500
5.30M
        for (int j = 0; j < num; j++) {
501
1.83M
            *output =
502
1.83M
                    static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask);
503
1.83M
            output++;
504
1.83M
        }
505
506
3.47M
        if (remainder) {
507
            // Process the last remaining remainder bit.
508
            // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits.
509
            // output = y << (bit_width - remainder) Use the high bit_width - remainder bit
510
            if constexpr (sizeof(T) <= 4) {
511
                auto masked_value = static_cast<T>(
512
                        static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1)));
513
                *output = static_cast<T>(masked_value << (bit_width - remainder));
514
3.34M
            } else {
515
3.34M
                auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1)));
516
3.34M
                *output = static_cast<T>(masked_value << (bit_width - remainder));
517
3.34M
            }
518
            // Already have remainder bits, next time need bit_width - remainder bits
519
3.34M
            need_bit = bit_width - remainder;
520
3.34M
        } else {
521
132k
            need_bit = 0;
522
132k
        }
523
3.47M
    }
524
525
    // remainder
526
48.5k
    if (tail_count) {
527
        // Put the tail_count numbers in the input into s in order, each number occupies 8 bit
528
338k
        for (int i = 0; i < tail_count; i++) {
529
300k
            s <<= 8;
530
300k
            s |= input[full_batch_size + i];
531
300k
        }
532
533
        // tail * 8 is the number of bits that are left to process
534
        // tail * 8 - more_bit is to remove the last more_bit
535
37.6k
        valid_bit = (tail_count << 3) - more_bit;
536
37.6k
        s >>= more_bit;
537
538
        // same as before
539
37.6k
        if (need_bit) {
540
            if constexpr (sizeof(T) <= 4) {
541
                *output |= static_cast<T>(static_cast<uint32_t>(
542
                        (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1)));
543
36.4k
            } else {
544
36.4k
                *output |= static_cast<T>((s >> (valid_bit - need_bit)) &
545
36.4k
                                          ((static_cast<U>(1) << need_bit) - 1));
546
36.4k
            }
547
36.4k
            output++;
548
36.4k
            valid_bit -= need_bit;
549
36.4k
        }
550
551
37.6k
        int num = valid_bit / bit_width; // How many outputs can be processed at a time
552
553
        // same as before
554
50.9k
        for (int j = 0; j < num; j++) {
555
13.3k
            *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask);
556
13.3k
            output++;
557
13.3k
        }
558
37.6k
    }
559
48.5k
}
Unexecuted instantiation: _ZN5doris10ForDecoderIhE19bit_unpack_optimizeIlEEvPKhhiPh
Unexecuted instantiation: _ZN5doris10ForDecoderIhE19bit_unpack_optimizeInEEvPKhhiPh
Unexecuted instantiation: _ZN5doris10ForDecoderItE19bit_unpack_optimizeIlEEvPKhhiPt
Unexecuted instantiation: _ZN5doris10ForDecoderItE19bit_unpack_optimizeInEEvPKhhiPt
_ZN5doris10ForDecoderIjE19bit_unpack_optimizeIlEEvPKhhiPj
Line
Count
Source
437
5
                                        T* output) {
438
5
    static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value,
439
5
                  "bit_unpack_optimize only supports U = int64_t or __int128_t");
440
5
    constexpr int u_size = sizeof(U);                   // Size of U
441
5
    constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size)
442
5
    int valid_bit = 0;                                  // How many valid bits
443
5
    int need_bit = 0;                                   // still need
444
5
    size_t input_size = (in_num * bit_width + 7) >> 3;  // input's size
445
5
    int full_batch_size =
446
5
            cast_set<int>((input_size >> u_size_shift)
447
5
                          << u_size_shift);     // Adjust input_size to a multiple of u_size
448
5
    int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size.
449
    // The number of bits in input to adjust to multiples of 8 and thus more
450
5
    int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width));
451
452
    // to ensure that only bit_width bits are valid
453
5
    T output_mask;
454
5
    if (bit_width >= static_cast<int>(sizeof(T) * 8)) {
455
0
        output_mask = static_cast<T>(~T(0));
456
5
    } else {
457
5
        output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1);
458
5
    }
459
460
5
    U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking
461
462
15
    for (int i = 0; i < full_batch_size; i += u_size) {
463
10
        s = 0;
464
465
10
        s = to_endian<std::endian::big>(*((U*)(input + i)));
466
467
        // Determine what the valid bits are based on u_size
468
10
        valid_bit = u_size << 3;
469
470
        // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop.
471
10
        if (tail_count == 0 && i == full_batch_size - u_size) {
472
5
            valid_bit -= more_bit;
473
5
            s >>= more_bit;
474
5
        }
475
476
10
        if (need_bit) {
477
            // The last time we take away the high bit_width - need_bit,
478
            // we need to make up the rest of the need_bit from the width.
479
            // Use valid_bit - need_bit to compute high need_bit bits of s
480
            // perform an AND operation to ensure that only need_bit bits are valid
481
0
            auto mask = (static_cast<U>(1) << need_bit) - 1;
482
0
            auto shifted = s >> (valid_bit - need_bit);
483
0
            auto masked_result = shifted & mask;
484
0
            if constexpr (sizeof(T) <= 4) {
485
0
                *output |= static_cast<T>(static_cast<uint32_t>(masked_result));
486
            } else {
487
                *output |= static_cast<T>(masked_result);
488
            }
489
0
            output++;
490
0
            valid_bit -= need_bit;
491
0
        }
492
493
10
        int num = valid_bit / bit_width;             // How many outputs can be processed at a time
494
10
        int remainder = valid_bit - num * bit_width; // How many bits are left to store
495
496
        // Starting with the highest valid bit, take out bit_width bits in sequence
497
        // perform an AND operation with output_mask to ensure that only bit_width bits are valid
498
        // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end
499
        // But since there are still remainder bits that can't be processed, need to add the remainder
500
650
        for (int j = 0; j < num; j++) {
501
640
            *output =
502
640
                    static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask);
503
640
            output++;
504
640
        }
505
506
10
        if (remainder) {
507
            // Process the last remaining remainder bit.
508
            // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits.
509
            // output = y << (bit_width - remainder) Use the high bit_width - remainder bit
510
0
            if constexpr (sizeof(T) <= 4) {
511
0
                auto masked_value = static_cast<T>(
512
0
                        static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1)));
513
0
                *output = static_cast<T>(masked_value << (bit_width - remainder));
514
            } else {
515
                auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1)));
516
                *output = static_cast<T>(masked_value << (bit_width - remainder));
517
            }
518
            // Already have remainder bits, next time need bit_width - remainder bits
519
0
            need_bit = bit_width - remainder;
520
10
        } else {
521
10
            need_bit = 0;
522
10
        }
523
10
    }
524
525
    // remainder
526
5
    if (tail_count) {
527
        // Put the tail_count numbers in the input into s in order, each number occupies 8 bit
528
0
        for (int i = 0; i < tail_count; i++) {
529
0
            s <<= 8;
530
0
            s |= input[full_batch_size + i];
531
0
        }
532
533
        // tail * 8 is the number of bits that are left to process
534
        // tail * 8 - more_bit is to remove the last more_bit
535
0
        valid_bit = (tail_count << 3) - more_bit;
536
0
        s >>= more_bit;
537
538
        // same as before
539
0
        if (need_bit) {
540
0
            if constexpr (sizeof(T) <= 4) {
541
0
                *output |= static_cast<T>(static_cast<uint32_t>(
542
0
                        (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1)));
543
            } else {
544
                *output |= static_cast<T>((s >> (valid_bit - need_bit)) &
545
                                          ((static_cast<U>(1) << need_bit) - 1));
546
            }
547
0
            output++;
548
0
            valid_bit -= need_bit;
549
0
        }
550
551
0
        int num = valid_bit / bit_width; // How many outputs can be processed at a time
552
553
        // same as before
554
0
        for (int j = 0; j < num; j++) {
555
0
            *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask);
556
0
            output++;
557
0
        }
558
0
    }
559
5
}
Unexecuted instantiation: _ZN5doris10ForDecoderIjE19bit_unpack_optimizeInEEvPKhhiPj
Unexecuted instantiation: _ZN5doris10ForDecoderImE19bit_unpack_optimizeIlEEvPKhhiPm
Unexecuted instantiation: _ZN5doris10ForDecoderImE19bit_unpack_optimizeInEEvPKhhiPm
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE19bit_unpack_optimizeIlEEvPKhhiPS1_
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE19bit_unpack_optimizeInEEvPKhhiPS1_
Unexecuted instantiation: _ZN5doris10ForDecoderIoE19bit_unpack_optimizeIlEEvPKhhiPo
Unexecuted instantiation: _ZN5doris10ForDecoderIoE19bit_unpack_optimizeInEEvPKhhiPo
560
561
// The reverse of bit_pack method, get original integer data list from packed bits
562
// param[in] input: the packed bits need to unpack
563
// param[in] in_num: the integer number in packed bits
564
// param[in] bit_width: how many bit we used to store each integer data
565
// param[out] output: the original integer data list
566
template <typename T>
567
81.3k
void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) {
568
    /*
569
        When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times.
570
        When bit_width > 64, we must use __int128_t and unroll the loop 16 times.
571
    */
572
81.3k
    if (bit_width <= 32) {
573
20.6k
        bit_unpack_optimize<int64_t>(input, in_num, bit_width, output);
574
60.6k
    } else {
575
60.6k
        bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output);
576
60.6k
    }
577
81.3k
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE10bit_unpackEPKhhiPa
Unexecuted instantiation: _ZN5doris10ForDecoderIsE10bit_unpackEPKhhiPs
_ZN5doris10ForDecoderIiE10bit_unpackEPKhhiPi
Line
Count
Source
567
9
void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) {
568
    /*
569
        When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times.
570
        When bit_width > 64, we must use __int128_t and unroll the loop 16 times.
571
    */
572
9
    if (bit_width <= 32) {
573
9
        bit_unpack_optimize<int64_t>(input, in_num, bit_width, output);
574
9
    } else {
575
0
        bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output);
576
0
    }
577
9
}
_ZN5doris10ForDecoderIlE10bit_unpackEPKhhiPl
Line
Count
Source
567
24.4k
void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) {
568
    /*
569
        When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times.
570
        When bit_width > 64, we must use __int128_t and unroll the loop 16 times.
571
    */
572
24.4k
    if (bit_width <= 32) {
573
12.3k
        bit_unpack_optimize<int64_t>(input, in_num, bit_width, output);
574
12.3k
    } else {
575
12.1k
        bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output);
576
12.1k
    }
577
24.4k
}
_ZN5doris10ForDecoderInE10bit_unpackEPKhhiPn
Line
Count
Source
567
56.8k
void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) {
568
    /*
569
        When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times.
570
        When bit_width > 64, we must use __int128_t and unroll the loop 16 times.
571
    */
572
56.8k
    if (bit_width <= 32) {
573
8.28k
        bit_unpack_optimize<int64_t>(input, in_num, bit_width, output);
574
48.5k
    } else {
575
48.5k
        bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output);
576
48.5k
    }
577
56.8k
}
Unexecuted instantiation: _ZN5doris10ForDecoderIhE10bit_unpackEPKhhiPh
Unexecuted instantiation: _ZN5doris10ForDecoderItE10bit_unpackEPKhhiPt
_ZN5doris10ForDecoderIjE10bit_unpackEPKhhiPj
Line
Count
Source
567
5
void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) {
568
    /*
569
        When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times.
570
        When bit_width > 64, we must use __int128_t and unroll the loop 16 times.
571
    */
572
5
    if (bit_width <= 32) {
573
5
        bit_unpack_optimize<int64_t>(input, in_num, bit_width, output);
574
5
    } else {
575
0
        bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output);
576
0
    }
577
5
}
Unexecuted instantiation: _ZN5doris10ForDecoderImE10bit_unpackEPKhhiPm
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE10bit_unpackEPKhhiPS1_
Unexecuted instantiation: _ZN5doris10ForDecoderIoE10bit_unpackEPKhhiPo
578
579
template <typename T>
580
4.17M
void ForDecoder<T>::decode_current_frame(T* output) {
581
4.17M
    uint32_t frame_index = _current_index / _max_frame_size;
582
4.17M
    if (frame_index == _current_decoded_frame) {
583
4.12M
        return; // current frame already decoded
584
4.12M
    }
585
48.9k
    _current_decoded_frame = frame_index;
586
48.9k
    uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index));
587
588
48.9k
    uint32_t base_offset = _frame_offsets[_current_decoded_frame];
589
48.9k
    T min = 0;
590
48.9k
    uint32_t delta_offset = 0;
591
48.9k
    if constexpr (sizeof(T) == 16) {
592
24.4k
        min = static_cast<T>(decode_fixed128_le(_buffer + base_offset));
593
24.4k
        delta_offset = base_offset + 16;
594
24.4k
    } else if constexpr (sizeof(T) == 8) {
595
24.4k
        min = static_cast<T>(decode_fixed64_le(_buffer + base_offset));
596
24.4k
        delta_offset = base_offset + 8;
597
24.4k
    } else {
598
14
        min = static_cast<T>(decode_fixed32_le(_buffer + base_offset));
599
14
        delta_offset = base_offset + 4;
600
14
    }
601
602
48.9k
    uint8_t bit_width = _bit_widths[_current_decoded_frame];
603
604
48.9k
    bool is_original_value = _storage_formats[_current_decoded_frame] == 2;
605
48.9k
    if (is_original_value) {
606
0
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output);
607
48.9k
    } else {
608
48.9k
        bool is_ascending = _storage_formats[_current_decoded_frame] == 1;
609
48.9k
        std::vector<T> delta_values(current_frame_size);
610
48.9k
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data());
611
48.9k
        if (is_ascending) {
612
451
            T pre_value = min;
613
3.87k
            for (uint8_t i = 0; i < current_frame_size; i++) {
614
3.42k
                T value = delta_values[i] + pre_value;
615
3.42k
                output[i] = value;
616
3.42k
                pre_value = value;
617
3.42k
            }
618
48.4k
        } else {
619
4.22M
            for (uint8_t i = 0; i < current_frame_size; i++) {
620
4.17M
                output[i] = delta_values[i] + min;
621
4.17M
            }
622
48.4k
        }
623
48.9k
    }
624
48.9k
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE20decode_current_frameEPa
Unexecuted instantiation: _ZN5doris10ForDecoderIsE20decode_current_frameEPs
_ZN5doris10ForDecoderIiE20decode_current_frameEPi
Line
Count
Source
580
10
void ForDecoder<T>::decode_current_frame(T* output) {
581
10
    uint32_t frame_index = _current_index / _max_frame_size;
582
10
    if (frame_index == _current_decoded_frame) {
583
1
        return; // current frame already decoded
584
1
    }
585
9
    _current_decoded_frame = frame_index;
586
9
    uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index));
587
588
9
    uint32_t base_offset = _frame_offsets[_current_decoded_frame];
589
9
    T min = 0;
590
9
    uint32_t delta_offset = 0;
591
    if constexpr (sizeof(T) == 16) {
592
        min = static_cast<T>(decode_fixed128_le(_buffer + base_offset));
593
        delta_offset = base_offset + 16;
594
    } else if constexpr (sizeof(T) == 8) {
595
        min = static_cast<T>(decode_fixed64_le(_buffer + base_offset));
596
        delta_offset = base_offset + 8;
597
9
    } else {
598
9
        min = static_cast<T>(decode_fixed32_le(_buffer + base_offset));
599
9
        delta_offset = base_offset + 4;
600
9
    }
601
602
9
    uint8_t bit_width = _bit_widths[_current_decoded_frame];
603
604
9
    bool is_original_value = _storage_formats[_current_decoded_frame] == 2;
605
9
    if (is_original_value) {
606
0
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output);
607
9
    } else {
608
9
        bool is_ascending = _storage_formats[_current_decoded_frame] == 1;
609
9
        std::vector<T> delta_values(current_frame_size);
610
9
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data());
611
9
        if (is_ascending) {
612
9
            T pre_value = min;
613
780
            for (uint8_t i = 0; i < current_frame_size; i++) {
614
771
                T value = delta_values[i] + pre_value;
615
771
                output[i] = value;
616
771
                pre_value = value;
617
771
            }
618
9
        } else {
619
0
            for (uint8_t i = 0; i < current_frame_size; i++) {
620
0
                output[i] = delta_values[i] + min;
621
0
            }
622
0
        }
623
9
    }
624
9
}
_ZN5doris10ForDecoderIlE20decode_current_frameEPl
Line
Count
Source
580
2.08M
void ForDecoder<T>::decode_current_frame(T* output) {
581
2.08M
    uint32_t frame_index = _current_index / _max_frame_size;
582
2.08M
    if (frame_index == _current_decoded_frame) {
583
2.06M
        return; // current frame already decoded
584
2.06M
    }
585
24.4k
    _current_decoded_frame = frame_index;
586
24.4k
    uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index));
587
588
24.4k
    uint32_t base_offset = _frame_offsets[_current_decoded_frame];
589
24.4k
    T min = 0;
590
24.4k
    uint32_t delta_offset = 0;
591
    if constexpr (sizeof(T) == 16) {
592
        min = static_cast<T>(decode_fixed128_le(_buffer + base_offset));
593
        delta_offset = base_offset + 16;
594
24.4k
    } else if constexpr (sizeof(T) == 8) {
595
24.4k
        min = static_cast<T>(decode_fixed64_le(_buffer + base_offset));
596
24.4k
        delta_offset = base_offset + 8;
597
    } else {
598
        min = static_cast<T>(decode_fixed32_le(_buffer + base_offset));
599
        delta_offset = base_offset + 4;
600
    }
601
602
24.4k
    uint8_t bit_width = _bit_widths[_current_decoded_frame];
603
604
24.4k
    bool is_original_value = _storage_formats[_current_decoded_frame] == 2;
605
24.4k
    if (is_original_value) {
606
0
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output);
607
24.4k
    } else {
608
24.4k
        bool is_ascending = _storage_formats[_current_decoded_frame] == 1;
609
24.4k
        std::vector<T> delta_values(current_frame_size);
610
24.4k
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data());
611
24.4k
        if (is_ascending) {
612
223
            T pre_value = min;
613
1.89k
            for (uint8_t i = 0; i < current_frame_size; i++) {
614
1.67k
                T value = delta_values[i] + pre_value;
615
1.67k
                output[i] = value;
616
1.67k
                pre_value = value;
617
1.67k
            }
618
24.2k
        } else {
619
2.11M
            for (uint8_t i = 0; i < current_frame_size; i++) {
620
2.08M
                output[i] = delta_values[i] + min;
621
2.08M
            }
622
24.2k
        }
623
24.4k
    }
624
24.4k
}
_ZN5doris10ForDecoderInE20decode_current_frameEPn
Line
Count
Source
580
2.08M
void ForDecoder<T>::decode_current_frame(T* output) {
581
2.08M
    uint32_t frame_index = _current_index / _max_frame_size;
582
2.08M
    if (frame_index == _current_decoded_frame) {
583
2.06M
        return; // current frame already decoded
584
2.06M
    }
585
24.4k
    _current_decoded_frame = frame_index;
586
24.4k
    uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index));
587
588
24.4k
    uint32_t base_offset = _frame_offsets[_current_decoded_frame];
589
24.4k
    T min = 0;
590
24.4k
    uint32_t delta_offset = 0;
591
24.4k
    if constexpr (sizeof(T) == 16) {
592
24.4k
        min = static_cast<T>(decode_fixed128_le(_buffer + base_offset));
593
24.4k
        delta_offset = base_offset + 16;
594
    } else if constexpr (sizeof(T) == 8) {
595
        min = static_cast<T>(decode_fixed64_le(_buffer + base_offset));
596
        delta_offset = base_offset + 8;
597
    } else {
598
        min = static_cast<T>(decode_fixed32_le(_buffer + base_offset));
599
        delta_offset = base_offset + 4;
600
    }
601
602
24.4k
    uint8_t bit_width = _bit_widths[_current_decoded_frame];
603
604
24.4k
    bool is_original_value = _storage_formats[_current_decoded_frame] == 2;
605
24.4k
    if (is_original_value) {
606
0
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output);
607
24.4k
    } else {
608
24.4k
        bool is_ascending = _storage_formats[_current_decoded_frame] == 1;
609
24.4k
        std::vector<T> delta_values(current_frame_size);
610
24.4k
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data());
611
24.4k
        if (is_ascending) {
612
214
            T pre_value = min;
613
552
            for (uint8_t i = 0; i < current_frame_size; i++) {
614
338
                T value = delta_values[i] + pre_value;
615
338
                output[i] = value;
616
338
                pre_value = value;
617
338
            }
618
24.2k
        } else {
619
2.11M
            for (uint8_t i = 0; i < current_frame_size; i++) {
620
2.08M
                output[i] = delta_values[i] + min;
621
2.08M
            }
622
24.2k
        }
623
24.4k
    }
624
24.4k
}
Unexecuted instantiation: _ZN5doris10ForDecoderIhE20decode_current_frameEPh
Unexecuted instantiation: _ZN5doris10ForDecoderItE20decode_current_frameEPt
_ZN5doris10ForDecoderIjE20decode_current_frameEPj
Line
Count
Source
580
5
void ForDecoder<T>::decode_current_frame(T* output) {
581
5
    uint32_t frame_index = _current_index / _max_frame_size;
582
5
    if (frame_index == _current_decoded_frame) {
583
0
        return; // current frame already decoded
584
0
    }
585
5
    _current_decoded_frame = frame_index;
586
5
    uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index));
587
588
5
    uint32_t base_offset = _frame_offsets[_current_decoded_frame];
589
5
    T min = 0;
590
5
    uint32_t delta_offset = 0;
591
    if constexpr (sizeof(T) == 16) {
592
        min = static_cast<T>(decode_fixed128_le(_buffer + base_offset));
593
        delta_offset = base_offset + 16;
594
    } else if constexpr (sizeof(T) == 8) {
595
        min = static_cast<T>(decode_fixed64_le(_buffer + base_offset));
596
        delta_offset = base_offset + 8;
597
5
    } else {
598
5
        min = static_cast<T>(decode_fixed32_le(_buffer + base_offset));
599
5
        delta_offset = base_offset + 4;
600
5
    }
601
602
5
    uint8_t bit_width = _bit_widths[_current_decoded_frame];
603
604
5
    bool is_original_value = _storage_formats[_current_decoded_frame] == 2;
605
5
    if (is_original_value) {
606
0
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output);
607
5
    } else {
608
5
        bool is_ascending = _storage_formats[_current_decoded_frame] == 1;
609
5
        std::vector<T> delta_values(current_frame_size);
610
5
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data());
611
5
        if (is_ascending) {
612
5
            T pre_value = min;
613
645
            for (uint8_t i = 0; i < current_frame_size; i++) {
614
640
                T value = delta_values[i] + pre_value;
615
640
                output[i] = value;
616
640
                pre_value = value;
617
640
            }
618
5
        } else {
619
0
            for (uint8_t i = 0; i < current_frame_size; i++) {
620
0
                output[i] = delta_values[i] + min;
621
0
            }
622
0
        }
623
5
    }
624
5
}
Unexecuted instantiation: _ZN5doris10ForDecoderImE20decode_current_frameEPm
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE20decode_current_frameEPS1_
Unexecuted instantiation: _ZN5doris10ForDecoderIoE20decode_current_frameEPo
625
626
template <typename T>
627
12
T ForDecoder<T>::decode_frame_min_value(uint32_t frame_index) {
628
12
    uint32_t min_offset = _frame_offsets[frame_index];
629
12
    T min = 0;
630
12
    if constexpr (sizeof(T) == 16) {
631
0
        min = static_cast<T>(decode_fixed128_le(_buffer + min_offset));
632
12
    } else if constexpr (sizeof(T) == 8) {
633
12
        min = static_cast<T>(decode_fixed64_le(_buffer + min_offset));
634
12
    } else {
635
0
        min = static_cast<T>(decode_fixed32_le(_buffer + min_offset));
636
0
    }
637
12
    return min;
638
12
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE22decode_frame_min_valueEj
Unexecuted instantiation: _ZN5doris10ForDecoderIsE22decode_frame_min_valueEj
Unexecuted instantiation: _ZN5doris10ForDecoderIiE22decode_frame_min_valueEj
_ZN5doris10ForDecoderIlE22decode_frame_min_valueEj
Line
Count
Source
627
12
T ForDecoder<T>::decode_frame_min_value(uint32_t frame_index) {
628
12
    uint32_t min_offset = _frame_offsets[frame_index];
629
12
    T min = 0;
630
    if constexpr (sizeof(T) == 16) {
631
        min = static_cast<T>(decode_fixed128_le(_buffer + min_offset));
632
12
    } else if constexpr (sizeof(T) == 8) {
633
12
        min = static_cast<T>(decode_fixed64_le(_buffer + min_offset));
634
    } else {
635
        min = static_cast<T>(decode_fixed32_le(_buffer + min_offset));
636
    }
637
12
    return min;
638
12
}
Unexecuted instantiation: _ZN5doris10ForDecoderInE22decode_frame_min_valueEj
Unexecuted instantiation: _ZN5doris10ForDecoderIhE22decode_frame_min_valueEj
Unexecuted instantiation: _ZN5doris10ForDecoderItE22decode_frame_min_valueEj
Unexecuted instantiation: _ZN5doris10ForDecoderIjE22decode_frame_min_valueEj
Unexecuted instantiation: _ZN5doris10ForDecoderImE22decode_frame_min_valueEj
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE22decode_frame_min_valueEj
Unexecuted instantiation: _ZN5doris10ForDecoderIoE22decode_frame_min_valueEj
639
640
template <typename T>
641
4.17M
T* ForDecoder<T>::copy_value(T* val, size_t count) {
642
4.17M
    memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count);
643
4.17M
    _current_index += count;
644
4.17M
    val += count;
645
4.17M
    return val;
646
4.17M
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE10copy_valueEPam
Unexecuted instantiation: _ZN5doris10ForDecoderIsE10copy_valueEPsm
_ZN5doris10ForDecoderIiE10copy_valueEPim
Line
Count
Source
641
8
T* ForDecoder<T>::copy_value(T* val, size_t count) {
642
8
    memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count);
643
8
    _current_index += count;
644
8
    val += count;
645
8
    return val;
646
8
}
_ZN5doris10ForDecoderIlE10copy_valueEPlm
Line
Count
Source
641
2.08M
T* ForDecoder<T>::copy_value(T* val, size_t count) {
642
2.08M
    memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count);
643
2.08M
    _current_index += count;
644
2.08M
    val += count;
645
2.08M
    return val;
646
2.08M
}
_ZN5doris10ForDecoderInE10copy_valueEPnm
Line
Count
Source
641
2.08M
T* ForDecoder<T>::copy_value(T* val, size_t count) {
642
2.08M
    memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count);
643
2.08M
    _current_index += count;
644
2.08M
    val += count;
645
2.08M
    return val;
646
2.08M
}
Unexecuted instantiation: _ZN5doris10ForDecoderIhE10copy_valueEPhm
Unexecuted instantiation: _ZN5doris10ForDecoderItE10copy_valueEPtm
_ZN5doris10ForDecoderIjE10copy_valueEPjm
Line
Count
Source
641
3
T* ForDecoder<T>::copy_value(T* val, size_t count) {
642
3
    memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count);
643
3
    _current_index += count;
644
3
    val += count;
645
3
    return val;
646
3
}
Unexecuted instantiation: _ZN5doris10ForDecoderImE10copy_valueEPmm
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE10copy_valueEPS1_m
Unexecuted instantiation: _ZN5doris10ForDecoderIoE10copy_valueEPom
647
648
template <typename T>
649
4.17M
bool ForDecoder<T>::get_batch(T* val, size_t count) {
650
4.17M
    if (_current_index + count > _values_num) {
651
1
        return false;
652
1
    }
653
654
4.17M
    decode_current_frame(_out_buffer.data());
655
656
4.17M
    if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) {
657
4.16M
        copy_value(val, count);
658
4.16M
        return true;
659
4.16M
    }
660
661
    // 1. padding one frame
662
16.3k
    size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index;
663
16.3k
    val = copy_value(val, padding_num);
664
665
    // 2. process frame by frame
666
16.3k
    size_t frame_count = (count - padding_num) / _max_frame_size;
667
16.4k
    for (size_t i = 0; i < frame_count; i++) {
668
        // directly decode value to the output, don't  buffer the value
669
7
        decode_current_frame(val);
670
7
        _current_index += _max_frame_size;
671
7
        val += _max_frame_size;
672
7
    }
673
674
    // 3. process remaining value
675
16.3k
    size_t remaining_num = (count - padding_num) % _max_frame_size;
676
16.3k
    if (remaining_num > 0) {
677
4
        decode_current_frame(_out_buffer.data());
678
4
        val = copy_value(val, remaining_num);
679
4
    }
680
681
16.3k
    return true;
682
4.17M
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE9get_batchEPam
Unexecuted instantiation: _ZN5doris10ForDecoderIsE9get_batchEPsm
_ZN5doris10ForDecoderIiE9get_batchEPim
Line
Count
Source
649
8
bool ForDecoder<T>::get_batch(T* val, size_t count) {
650
8
    if (_current_index + count > _values_num) {
651
1
        return false;
652
1
    }
653
654
7
    decode_current_frame(_out_buffer.data());
655
656
7
    if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) {
657
4
        copy_value(val, count);
658
4
        return true;
659
4
    }
660
661
    // 1. padding one frame
662
3
    size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index;
663
3
    val = copy_value(val, padding_num);
664
665
    // 2. process frame by frame
666
3
    size_t frame_count = (count - padding_num) / _max_frame_size;
667
5
    for (size_t i = 0; i < frame_count; i++) {
668
        // directly decode value to the output, don't  buffer the value
669
2
        decode_current_frame(val);
670
2
        _current_index += _max_frame_size;
671
2
        val += _max_frame_size;
672
2
    }
673
674
    // 3. process remaining value
675
3
    size_t remaining_num = (count - padding_num) % _max_frame_size;
676
3
    if (remaining_num > 0) {
677
1
        decode_current_frame(_out_buffer.data());
678
1
        val = copy_value(val, remaining_num);
679
1
    }
680
681
3
    return true;
682
7
}
_ZN5doris10ForDecoderIlE9get_batchEPlm
Line
Count
Source
649
2.08M
bool ForDecoder<T>::get_batch(T* val, size_t count) {
650
2.08M
    if (_current_index + count > _values_num) {
651
0
        return false;
652
0
    }
653
654
2.08M
    decode_current_frame(_out_buffer.data());
655
656
2.08M
    if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) {
657
2.08M
        copy_value(val, count);
658
2.08M
        return true;
659
2.08M
    }
660
661
    // 1. padding one frame
662
8.19k
    size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index;
663
8.19k
    val = copy_value(val, padding_num);
664
665
    // 2. process frame by frame
666
8.19k
    size_t frame_count = (count - padding_num) / _max_frame_size;
667
8.19k
    for (size_t i = 0; i < frame_count; i++) {
668
        // directly decode value to the output, don't  buffer the value
669
3
        decode_current_frame(val);
670
3
        _current_index += _max_frame_size;
671
3
        val += _max_frame_size;
672
3
    }
673
674
    // 3. process remaining value
675
8.19k
    size_t remaining_num = (count - padding_num) % _max_frame_size;
676
8.19k
    if (remaining_num > 0) {
677
3
        decode_current_frame(_out_buffer.data());
678
3
        val = copy_value(val, remaining_num);
679
3
    }
680
681
8.19k
    return true;
682
2.08M
}
_ZN5doris10ForDecoderInE9get_batchEPnm
Line
Count
Source
649
2.08M
bool ForDecoder<T>::get_batch(T* val, size_t count) {
650
2.08M
    if (_current_index + count > _values_num) {
651
0
        return false;
652
0
    }
653
654
2.08M
    decode_current_frame(_out_buffer.data());
655
656
2.08M
    if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) {
657
2.08M
        copy_value(val, count);
658
2.08M
        return true;
659
2.08M
    }
660
661
    // 1. padding one frame
662
8.19k
    size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index;
663
8.19k
    val = copy_value(val, padding_num);
664
665
    // 2. process frame by frame
666
8.19k
    size_t frame_count = (count - padding_num) / _max_frame_size;
667
8.19k
    for (size_t i = 0; i < frame_count; i++) {
668
        // directly decode value to the output, don't  buffer the value
669
0
        decode_current_frame(val);
670
0
        _current_index += _max_frame_size;
671
0
        val += _max_frame_size;
672
0
    }
673
674
    // 3. process remaining value
675
8.19k
    size_t remaining_num = (count - padding_num) % _max_frame_size;
676
8.19k
    if (remaining_num > 0) {
677
0
        decode_current_frame(_out_buffer.data());
678
0
        val = copy_value(val, remaining_num);
679
0
    }
680
681
8.19k
    return true;
682
2.08M
}
Unexecuted instantiation: _ZN5doris10ForDecoderIhE9get_batchEPhm
Unexecuted instantiation: _ZN5doris10ForDecoderItE9get_batchEPtm
_ZN5doris10ForDecoderIjE9get_batchEPjm
Line
Count
Source
649
3
bool ForDecoder<T>::get_batch(T* val, size_t count) {
650
3
    if (_current_index + count > _values_num) {
651
0
        return false;
652
0
    }
653
654
3
    decode_current_frame(_out_buffer.data());
655
656
3
    if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) {
657
0
        copy_value(val, count);
658
0
        return true;
659
0
    }
660
661
    // 1. padding one frame
662
3
    size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index;
663
3
    val = copy_value(val, padding_num);
664
665
    // 2. process frame by frame
666
3
    size_t frame_count = (count - padding_num) / _max_frame_size;
667
5
    for (size_t i = 0; i < frame_count; i++) {
668
        // directly decode value to the output, don't  buffer the value
669
2
        decode_current_frame(val);
670
2
        _current_index += _max_frame_size;
671
2
        val += _max_frame_size;
672
2
    }
673
674
    // 3. process remaining value
675
3
    size_t remaining_num = (count - padding_num) % _max_frame_size;
676
3
    if (remaining_num > 0) {
677
0
        decode_current_frame(_out_buffer.data());
678
0
        val = copy_value(val, remaining_num);
679
0
    }
680
681
3
    return true;
682
3
}
Unexecuted instantiation: _ZN5doris10ForDecoderImE9get_batchEPmm
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE9get_batchEPS1_m
Unexecuted instantiation: _ZN5doris10ForDecoderIoE9get_batchEPom
683
684
template <typename T>
685
3
bool ForDecoder<T>::skip(int32_t skip_num) {
686
3
    if (_current_index + skip_num >= _values_num) {
687
0
        return false;
688
0
    }
689
3
    _current_index = _current_index + skip_num;
690
3
    return true;
691
3
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE4skipEi
Unexecuted instantiation: _ZN5doris10ForDecoderIsE4skipEi
Unexecuted instantiation: _ZN5doris10ForDecoderIiE4skipEi
Unexecuted instantiation: _ZN5doris10ForDecoderIlE4skipEi
Unexecuted instantiation: _ZN5doris10ForDecoderInE4skipEi
Unexecuted instantiation: _ZN5doris10ForDecoderIhE4skipEi
Unexecuted instantiation: _ZN5doris10ForDecoderItE4skipEi
_ZN5doris10ForDecoderIjE4skipEi
Line
Count
Source
685
3
bool ForDecoder<T>::skip(int32_t skip_num) {
686
3
    if (_current_index + skip_num >= _values_num) {
687
0
        return false;
688
0
    }
689
3
    _current_index = _current_index + skip_num;
690
3
    return true;
691
3
}
Unexecuted instantiation: _ZN5doris10ForDecoderImE4skipEi
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE4skipEi
Unexecuted instantiation: _ZN5doris10ForDecoderIoE4skipEi
692
693
template <typename T>
694
6
uint32_t ForDecoder<T>::seek_last_frame_before_value(T target) {
695
    // first of all, find the first frame >= target
696
6
    uint32_t left = 0;
697
6
    uint32_t right = _frame_count;
698
18
    while (left < right) {
699
12
        uint32_t mid = left + (right - left) / 2;
700
12
        T midValue = decode_frame_min_value(mid);
701
12
        if (midValue < target) {
702
6
            left = mid + 1;
703
6
        } else {
704
6
            right = mid;
705
6
        }
706
12
    }
707
    // after loop, left is the first frame >= target
708
6
    if (left == 0) {
709
        // all frames are >= target, not found
710
2
        return _frame_count;
711
2
    }
712
    // otherwise previous frame is the last frame < target
713
4
    return left - 1;
714
6
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE28seek_last_frame_before_valueEa
Unexecuted instantiation: _ZN5doris10ForDecoderIsE28seek_last_frame_before_valueEs
Unexecuted instantiation: _ZN5doris10ForDecoderIiE28seek_last_frame_before_valueEi
_ZN5doris10ForDecoderIlE28seek_last_frame_before_valueEl
Line
Count
Source
694
6
uint32_t ForDecoder<T>::seek_last_frame_before_value(T target) {
695
    // first of all, find the first frame >= target
696
6
    uint32_t left = 0;
697
6
    uint32_t right = _frame_count;
698
18
    while (left < right) {
699
12
        uint32_t mid = left + (right - left) / 2;
700
12
        T midValue = decode_frame_min_value(mid);
701
12
        if (midValue < target) {
702
6
            left = mid + 1;
703
6
        } else {
704
6
            right = mid;
705
6
        }
706
12
    }
707
    // after loop, left is the first frame >= target
708
6
    if (left == 0) {
709
        // all frames are >= target, not found
710
2
        return _frame_count;
711
2
    }
712
    // otherwise previous frame is the last frame < target
713
4
    return left - 1;
714
6
}
Unexecuted instantiation: _ZN5doris10ForDecoderInE28seek_last_frame_before_valueEn
Unexecuted instantiation: _ZN5doris10ForDecoderIhE28seek_last_frame_before_valueEh
Unexecuted instantiation: _ZN5doris10ForDecoderItE28seek_last_frame_before_valueEt
Unexecuted instantiation: _ZN5doris10ForDecoderIjE28seek_last_frame_before_valueEj
Unexecuted instantiation: _ZN5doris10ForDecoderImE28seek_last_frame_before_valueEm
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE28seek_last_frame_before_valueES1_
Unexecuted instantiation: _ZN5doris10ForDecoderIoE28seek_last_frame_before_valueEo
715
716
template <typename T>
717
bool ForDecoder<T>::seek_lower_bound_inside_frame(uint32_t frame_index, T target,
718
4
                                                  bool* exact_match) {
719
4
    _current_index = frame_index * _max_frame_size;
720
4
    decode_current_frame(_out_buffer.data());
721
4
    auto end = _out_buffer.begin() + frame_size(frame_index);
722
4
    auto pos = std::lower_bound(_out_buffer.begin(), end, target);
723
4
    if (pos != end) { // found in this frame
724
2
        auto pos_in_frame = cast_set<uint32_t>(std::distance(_out_buffer.begin(), pos));
725
2
        *exact_match = _out_buffer[pos_in_frame] == target;
726
2
        _current_index += pos_in_frame;
727
2
        return true;
728
2
    }
729
2
    return false;
730
4
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE29seek_lower_bound_inside_frameEjaPb
Unexecuted instantiation: _ZN5doris10ForDecoderIsE29seek_lower_bound_inside_frameEjsPb
Unexecuted instantiation: _ZN5doris10ForDecoderIiE29seek_lower_bound_inside_frameEjiPb
_ZN5doris10ForDecoderIlE29seek_lower_bound_inside_frameEjlPb
Line
Count
Source
718
4
                                                  bool* exact_match) {
719
4
    _current_index = frame_index * _max_frame_size;
720
4
    decode_current_frame(_out_buffer.data());
721
4
    auto end = _out_buffer.begin() + frame_size(frame_index);
722
4
    auto pos = std::lower_bound(_out_buffer.begin(), end, target);
723
4
    if (pos != end) { // found in this frame
724
2
        auto pos_in_frame = cast_set<uint32_t>(std::distance(_out_buffer.begin(), pos));
725
2
        *exact_match = _out_buffer[pos_in_frame] == target;
726
2
        _current_index += pos_in_frame;
727
2
        return true;
728
2
    }
729
2
    return false;
730
4
}
Unexecuted instantiation: _ZN5doris10ForDecoderInE29seek_lower_bound_inside_frameEjnPb
Unexecuted instantiation: _ZN5doris10ForDecoderIhE29seek_lower_bound_inside_frameEjhPb
Unexecuted instantiation: _ZN5doris10ForDecoderItE29seek_lower_bound_inside_frameEjtPb
Unexecuted instantiation: _ZN5doris10ForDecoderIjE29seek_lower_bound_inside_frameEjjPb
Unexecuted instantiation: _ZN5doris10ForDecoderImE29seek_lower_bound_inside_frameEjmPb
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE29seek_lower_bound_inside_frameEjS1_Pb
Unexecuted instantiation: _ZN5doris10ForDecoderIoE29seek_lower_bound_inside_frameEjoPb
731
732
template <typename T>
733
6
bool ForDecoder<T>::seek_at_or_after_value(const void* value, bool* exact_match) {
734
6
    T target = *reinterpret_cast<const T*>(value);
735
6
    uint32_t frame_to_search = seek_last_frame_before_value(target);
736
6
    if (frame_to_search == _frame_count) {
737
        // all frames are >= target, the searched value must the be first value
738
2
        _current_index = 0;
739
2
        decode_current_frame(_out_buffer.data());
740
2
        *exact_match = _out_buffer[0] == target;
741
2
        return true;
742
2
    }
743
    // binary search inside the last frame < target
744
4
    bool found = seek_lower_bound_inside_frame(frame_to_search, target, exact_match);
745
    // if not found, all values in the last frame are less than target.
746
    // then the searched value must be the first value of the next frame.
747
4
    if (!found && frame_to_search < _frame_count - 1) {
748
1
        _current_index = (frame_to_search + 1) * _max_frame_size;
749
1
        decode_current_frame(_out_buffer.data());
750
1
        *exact_match = _out_buffer[0] == target;
751
1
        return true;
752
1
    }
753
3
    return found;
754
4
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE22seek_at_or_after_valueEPKvPb
Unexecuted instantiation: _ZN5doris10ForDecoderIsE22seek_at_or_after_valueEPKvPb
Unexecuted instantiation: _ZN5doris10ForDecoderIiE22seek_at_or_after_valueEPKvPb
_ZN5doris10ForDecoderIlE22seek_at_or_after_valueEPKvPb
Line
Count
Source
733
6
bool ForDecoder<T>::seek_at_or_after_value(const void* value, bool* exact_match) {
734
6
    T target = *reinterpret_cast<const T*>(value);
735
6
    uint32_t frame_to_search = seek_last_frame_before_value(target);
736
6
    if (frame_to_search == _frame_count) {
737
        // all frames are >= target, the searched value must the be first value
738
2
        _current_index = 0;
739
2
        decode_current_frame(_out_buffer.data());
740
2
        *exact_match = _out_buffer[0] == target;
741
2
        return true;
742
2
    }
743
    // binary search inside the last frame < target
744
4
    bool found = seek_lower_bound_inside_frame(frame_to_search, target, exact_match);
745
    // if not found, all values in the last frame are less than target.
746
    // then the searched value must be the first value of the next frame.
747
4
    if (!found && frame_to_search < _frame_count - 1) {
748
1
        _current_index = (frame_to_search + 1) * _max_frame_size;
749
1
        decode_current_frame(_out_buffer.data());
750
1
        *exact_match = _out_buffer[0] == target;
751
1
        return true;
752
1
    }
753
3
    return found;
754
4
}
Unexecuted instantiation: _ZN5doris10ForDecoderInE22seek_at_or_after_valueEPKvPb
Unexecuted instantiation: _ZN5doris10ForDecoderIhE22seek_at_or_after_valueEPKvPb
Unexecuted instantiation: _ZN5doris10ForDecoderItE22seek_at_or_after_valueEPKvPb
Unexecuted instantiation: _ZN5doris10ForDecoderIjE22seek_at_or_after_valueEPKvPb
Unexecuted instantiation: _ZN5doris10ForDecoderImE22seek_at_or_after_valueEPKvPb
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE22seek_at_or_after_valueEPKvPb
Unexecuted instantiation: _ZN5doris10ForDecoderIoE22seek_at_or_after_valueEPKvPb
755
756
template class ForEncoder<int8_t>;
757
template class ForEncoder<int16_t>;
758
template class ForEncoder<int32_t>;
759
template class ForEncoder<int64_t>;
760
template class ForEncoder<int128_t>;
761
template class ForEncoder<uint8_t>;
762
template class ForEncoder<uint16_t>;
763
template class ForEncoder<uint32_t>;
764
template class ForEncoder<uint64_t>;
765
template class ForEncoder<uint24_t>;
766
template class ForEncoder<uint128_t>;
767
768
template class ForDecoder<int8_t>;
769
template class ForDecoder<int16_t>;
770
template class ForDecoder<int32_t>;
771
template class ForDecoder<int64_t>;
772
template class ForDecoder<int128_t>;
773
template class ForDecoder<uint8_t>;
774
template class ForDecoder<uint16_t>;
775
template class ForDecoder<uint32_t>;
776
template class ForDecoder<uint64_t>;
777
template class ForDecoder<uint24_t>;
778
template class ForDecoder<uint128_t>;
779
} // namespace doris