Coverage Report

Created: 2026-03-15 08:11

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/util/frame_of_reference_coding.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "util/frame_of_reference_coding.h"
19
20
#include <glog/logging.h>
21
#include <sys/types.h>
22
23
#include <algorithm>
24
#include <cstring>
25
#include <iostream>
26
#include <iterator>
27
#include <limits>
28
29
#include "common/cast_set.h"
30
#include "exec/common/endian.h"
31
#include "util/bit_util.h"
32
#include "util/coding.h"
33
34
namespace doris {
35
#include "common/compile_check_begin.h"
36
37
template <typename T>
38
4.17M
const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) {
39
4.17M
    memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T));
40
4.17M
    _buffered_values_num += count;
41
4.17M
    p_data += count;
42
4.17M
    return p_data;
43
4.17M
}
Unexecuted instantiation: _ZN5doris10ForEncoderIaE10copy_valueEPKam
Unexecuted instantiation: _ZN5doris10ForEncoderIsE10copy_valueEPKsm
_ZN5doris10ForEncoderIiE10copy_valueEPKim
Line
Count
Source
38
8
const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) {
39
8
    memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T));
40
8
    _buffered_values_num += count;
41
8
    p_data += count;
42
8
    return p_data;
43
8
}
_ZN5doris10ForEncoderIlE10copy_valueEPKlm
Line
Count
Source
38
2.08M
const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) {
39
2.08M
    memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T));
40
2.08M
    _buffered_values_num += count;
41
2.08M
    p_data += count;
42
2.08M
    return p_data;
43
2.08M
}
_ZN5doris10ForEncoderInE10copy_valueEPKnm
Line
Count
Source
38
2.08M
const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) {
39
2.08M
    memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T));
40
2.08M
    _buffered_values_num += count;
41
2.08M
    p_data += count;
42
2.08M
    return p_data;
43
2.08M
}
Unexecuted instantiation: _ZN5doris10ForEncoderIhE10copy_valueEPKhm
Unexecuted instantiation: _ZN5doris10ForEncoderItE10copy_valueEPKtm
_ZN5doris10ForEncoderIjE10copy_valueEPKjm
Line
Count
Source
38
3
const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) {
39
3
    memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T));
40
3
    _buffered_values_num += count;
41
3
    p_data += count;
42
3
    return p_data;
43
3
}
Unexecuted instantiation: _ZN5doris10ForEncoderImE10copy_valueEPKmm
Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10copy_valueEPKS1_m
Unexecuted instantiation: _ZN5doris10ForEncoderIoE10copy_valueEPKom
44
45
template <typename T>
46
4.17M
void ForEncoder<T>::put_batch(const T* in_data, size_t count) {
47
4.17M
    if (_buffered_values_num + count < FRAME_VALUE_NUM) {
48
4.16M
        copy_value(in_data, count);
49
4.16M
        _values_num += count;
50
4.16M
        return;
51
4.16M
    }
52
53
    // 1. padding one frame
54
16.3k
    size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num;
55
16.3k
    in_data = copy_value(in_data, padding_num);
56
16.3k
    bit_packing_one_frame_value(_buffered_values);
57
58
    // 2. process frame by frame
59
16.3k
    size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM;
60
16.4k
    for (size_t i = 0; i < frame_size; i++) {
61
        // directly encode value to the bit_writer, don't buffer the value
62
8
        _buffered_values_num = FRAME_VALUE_NUM;
63
8
        bit_packing_one_frame_value(in_data);
64
8
        in_data += FRAME_VALUE_NUM;
65
8
    }
66
67
    // 3. process remaining value
68
16.3k
    size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM;
69
16.3k
    if (remaining_num > 0) {
70
4
        copy_value(in_data, remaining_num);
71
4
    }
72
73
16.3k
    _values_num += count;
74
16.3k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIaE9put_batchEPKam
Unexecuted instantiation: _ZN5doris10ForEncoderIsE9put_batchEPKsm
_ZN5doris10ForEncoderIiE9put_batchEPKim
Line
Count
Source
46
7
void ForEncoder<T>::put_batch(const T* in_data, size_t count) {
47
7
    if (_buffered_values_num + count < FRAME_VALUE_NUM) {
48
4
        copy_value(in_data, count);
49
4
        _values_num += count;
50
4
        return;
51
4
    }
52
53
    // 1. padding one frame
54
3
    size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num;
55
3
    in_data = copy_value(in_data, padding_num);
56
3
    bit_packing_one_frame_value(_buffered_values);
57
58
    // 2. process frame by frame
59
3
    size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM;
60
5
    for (size_t i = 0; i < frame_size; i++) {
61
        // directly encode value to the bit_writer, don't buffer the value
62
2
        _buffered_values_num = FRAME_VALUE_NUM;
63
2
        bit_packing_one_frame_value(in_data);
64
2
        in_data += FRAME_VALUE_NUM;
65
2
    }
66
67
    // 3. process remaining value
68
3
    size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM;
69
3
    if (remaining_num > 0) {
70
1
        copy_value(in_data, remaining_num);
71
1
    }
72
73
3
    _values_num += count;
74
3
}
_ZN5doris10ForEncoderIlE9put_batchEPKlm
Line
Count
Source
46
2.08M
void ForEncoder<T>::put_batch(const T* in_data, size_t count) {
47
2.08M
    if (_buffered_values_num + count < FRAME_VALUE_NUM) {
48
2.08M
        copy_value(in_data, count);
49
2.08M
        _values_num += count;
50
2.08M
        return;
51
2.08M
    }
52
53
    // 1. padding one frame
54
8.19k
    size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num;
55
8.19k
    in_data = copy_value(in_data, padding_num);
56
8.19k
    bit_packing_one_frame_value(_buffered_values);
57
58
    // 2. process frame by frame
59
8.19k
    size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM;
60
8.19k
    for (size_t i = 0; i < frame_size; i++) {
61
        // directly encode value to the bit_writer, don't buffer the value
62
3
        _buffered_values_num = FRAME_VALUE_NUM;
63
3
        bit_packing_one_frame_value(in_data);
64
3
        in_data += FRAME_VALUE_NUM;
65
3
    }
66
67
    // 3. process remaining value
68
8.19k
    size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM;
69
8.19k
    if (remaining_num > 0) {
70
3
        copy_value(in_data, remaining_num);
71
3
    }
72
73
8.19k
    _values_num += count;
74
8.19k
}
_ZN5doris10ForEncoderInE9put_batchEPKnm
Line
Count
Source
46
2.08M
void ForEncoder<T>::put_batch(const T* in_data, size_t count) {
47
2.08M
    if (_buffered_values_num + count < FRAME_VALUE_NUM) {
48
2.08M
        copy_value(in_data, count);
49
2.08M
        _values_num += count;
50
2.08M
        return;
51
2.08M
    }
52
53
    // 1. padding one frame
54
8.19k
    size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num;
55
8.19k
    in_data = copy_value(in_data, padding_num);
56
8.19k
    bit_packing_one_frame_value(_buffered_values);
57
58
    // 2. process frame by frame
59
8.19k
    size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM;
60
8.19k
    for (size_t i = 0; i < frame_size; i++) {
61
        // directly encode value to the bit_writer, don't buffer the value
62
0
        _buffered_values_num = FRAME_VALUE_NUM;
63
0
        bit_packing_one_frame_value(in_data);
64
0
        in_data += FRAME_VALUE_NUM;
65
0
    }
66
67
    // 3. process remaining value
68
8.19k
    size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM;
69
8.19k
    if (remaining_num > 0) {
70
0
        copy_value(in_data, remaining_num);
71
0
    }
72
73
8.19k
    _values_num += count;
74
8.19k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIhE9put_batchEPKhm
Unexecuted instantiation: _ZN5doris10ForEncoderItE9put_batchEPKtm
_ZN5doris10ForEncoderIjE9put_batchEPKjm
Line
Count
Source
46
3
void ForEncoder<T>::put_batch(const T* in_data, size_t count) {
47
3
    if (_buffered_values_num + count < FRAME_VALUE_NUM) {
48
0
        copy_value(in_data, count);
49
0
        _values_num += count;
50
0
        return;
51
0
    }
52
53
    // 1. padding one frame
54
3
    size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num;
55
3
    in_data = copy_value(in_data, padding_num);
56
3
    bit_packing_one_frame_value(_buffered_values);
57
58
    // 2. process frame by frame
59
3
    size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM;
60
6
    for (size_t i = 0; i < frame_size; i++) {
61
        // directly encode value to the bit_writer, don't buffer the value
62
3
        _buffered_values_num = FRAME_VALUE_NUM;
63
3
        bit_packing_one_frame_value(in_data);
64
3
        in_data += FRAME_VALUE_NUM;
65
3
    }
66
67
    // 3. process remaining value
68
3
    size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM;
69
3
    if (remaining_num > 0) {
70
0
        copy_value(in_data, remaining_num);
71
0
    }
72
73
3
    _values_num += count;
74
3
}
Unexecuted instantiation: _ZN5doris10ForEncoderImE9put_batchEPKmm
Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE9put_batchEPKS1_m
Unexecuted instantiation: _ZN5doris10ForEncoderIoE9put_batchEPKom
75
76
// todo(kks): improve this method by SIMD instructions
77
78
template <typename T>
79
15.3k
void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
80
15.3k
    int64_t s = 0;
81
15.3k
    uint8_t output_mask = 255;
82
15.3k
    int tail_count = in_num & 7;              // the remainder of in_num modulo 8
83
15.3k
    int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8
84
85
237k
    for (int i = 0; i < full_batch_size; i += 8) {
86
        // Put the 8 numbers in the input into s in order, each number occupies bit_width bit
87
222k
        s |= static_cast<int64_t>(input[i + 7]);
88
222k
        s |= (static_cast<int64_t>(input[i + 6])) << bit_width;
89
222k
        s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width);
90
222k
        s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width);
91
222k
        s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width);
92
222k
        s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width);
93
222k
        s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width);
94
222k
        s |= (static_cast<int64_t>(input[i])) << (7 * bit_width);
95
96
        // Starting with the highest valid bit, take out 8 bits in sequence
97
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
98
        // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end
99
1.22M
        for (int j = 0; j < bit_width; j++) {
100
1.00M
            output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask;
101
1.00M
        }
102
222k
        output += bit_width;
103
222k
        s = 0;
104
222k
    }
105
106
    // remainder
107
15.3k
    int byte = tail_count * bit_width; // How many bits are left to store
108
15.3k
    int bytes = (byte + 7) >> 3;       // How many more bytes are needed to store the rest of input
109
110
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
111
65.4k
    for (int i = 0; i < tail_count; i++) {
112
50.1k
        s |= (static_cast<int64_t>(input[i + full_batch_size]))
113
50.1k
             << ((tail_count - i - 1) * bit_width);
114
50.1k
    }
115
116
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
117
15.3k
    s <<= (bytes << 3) - byte;
118
119
    // Starting with the highest valid bit, take out 8 bits in sequence
120
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
121
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
122
48.4k
    for (int i = 0; i < bytes; i++) {
123
33.1k
        output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask;
124
33.1k
    }
125
15.3k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIaE10bit_pack_8EPKahiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIsE10bit_pack_8EPKshiPh
_ZN5doris10ForEncoderIiE10bit_pack_8EPKihiPh
Line
Count
Source
79
8
void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
80
8
    int64_t s = 0;
81
8
    uint8_t output_mask = 255;
82
8
    int tail_count = in_num & 7;              // the remainder of in_num modulo 8
83
8
    int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8
84
85
104
    for (int i = 0; i < full_batch_size; i += 8) {
86
        // Put the 8 numbers in the input into s in order, each number occupies bit_width bit
87
96
        s |= static_cast<int64_t>(input[i + 7]);
88
96
        s |= (static_cast<int64_t>(input[i + 6])) << bit_width;
89
96
        s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width);
90
96
        s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width);
91
96
        s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width);
92
96
        s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width);
93
96
        s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width);
94
96
        s |= (static_cast<int64_t>(input[i])) << (7 * bit_width);
95
96
        // Starting with the highest valid bit, take out 8 bits in sequence
97
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
98
        // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end
99
192
        for (int j = 0; j < bit_width; j++) {
100
96
            output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask;
101
96
        }
102
96
        output += bit_width;
103
96
        s = 0;
104
96
    }
105
106
    // remainder
107
8
    int byte = tail_count * bit_width; // How many bits are left to store
108
8
    int bytes = (byte + 7) >> 3;       // How many more bytes are needed to store the rest of input
109
110
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
111
10
    for (int i = 0; i < tail_count; i++) {
112
2
        s |= (static_cast<int64_t>(input[i + full_batch_size]))
113
2
             << ((tail_count - i - 1) * bit_width);
114
2
    }
115
116
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
117
8
    s <<= (bytes << 3) - byte;
118
119
    // Starting with the highest valid bit, take out 8 bits in sequence
120
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
121
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
122
9
    for (int i = 0; i < bytes; i++) {
123
1
        output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask;
124
1
    }
125
8
}
_ZN5doris10ForEncoderIlE10bit_pack_8EPKlhiPh
Line
Count
Source
79
3.05k
void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
80
3.05k
    int64_t s = 0;
81
3.05k
    uint8_t output_mask = 255;
82
3.05k
    int tail_count = in_num & 7;              // the remainder of in_num modulo 8
83
3.05k
    int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8
84
85
34.9k
    for (int i = 0; i < full_batch_size; i += 8) {
86
        // Put the 8 numbers in the input into s in order, each number occupies bit_width bit
87
31.8k
        s |= static_cast<int64_t>(input[i + 7]);
88
31.8k
        s |= (static_cast<int64_t>(input[i + 6])) << bit_width;
89
31.8k
        s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width);
90
31.8k
        s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width);
91
31.8k
        s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width);
92
31.8k
        s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width);
93
31.8k
        s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width);
94
31.8k
        s |= (static_cast<int64_t>(input[i])) << (7 * bit_width);
95
96
        // Starting with the highest valid bit, take out 8 bits in sequence
97
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
98
        // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end
99
174k
        for (int j = 0; j < bit_width; j++) {
100
142k
            output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask;
101
142k
        }
102
31.8k
        output += bit_width;
103
31.8k
        s = 0;
104
31.8k
    }
105
106
    // remainder
107
3.05k
    int byte = tail_count * bit_width; // How many bits are left to store
108
3.05k
    int bytes = (byte + 7) >> 3;       // How many more bytes are needed to store the rest of input
109
110
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
111
10.2k
    for (int i = 0; i < tail_count; i++) {
112
7.16k
        s |= (static_cast<int64_t>(input[i + full_batch_size]))
113
7.16k
             << ((tail_count - i - 1) * bit_width);
114
7.16k
    }
115
116
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
117
3.05k
    s <<= (bytes << 3) - byte;
118
119
    // Starting with the highest valid bit, take out 8 bits in sequence
120
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
121
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
122
7.77k
    for (int i = 0; i < bytes; i++) {
123
4.72k
        output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask;
124
4.72k
    }
125
3.05k
}
_ZN5doris10ForEncoderInE10bit_pack_8EPKnhiPh
Line
Count
Source
79
12.2k
void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
80
12.2k
    int64_t s = 0;
81
12.2k
    uint8_t output_mask = 255;
82
12.2k
    int tail_count = in_num & 7;              // the remainder of in_num modulo 8
83
12.2k
    int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8
84
85
202k
    for (int i = 0; i < full_batch_size; i += 8) {
86
        // Put the 8 numbers in the input into s in order, each number occupies bit_width bit
87
190k
        s |= static_cast<int64_t>(input[i + 7]);
88
190k
        s |= (static_cast<int64_t>(input[i + 6])) << bit_width;
89
190k
        s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width);
90
190k
        s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width);
91
190k
        s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width);
92
190k
        s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width);
93
190k
        s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width);
94
190k
        s |= (static_cast<int64_t>(input[i])) << (7 * bit_width);
95
96
        // Starting with the highest valid bit, take out 8 bits in sequence
97
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
98
        // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end
99
1.04M
        for (int j = 0; j < bit_width; j++) {
100
857k
            output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask;
101
857k
        }
102
190k
        output += bit_width;
103
190k
        s = 0;
104
190k
    }
105
106
    // remainder
107
12.2k
    int byte = tail_count * bit_width; // How many bits are left to store
108
12.2k
    int bytes = (byte + 7) >> 3;       // How many more bytes are needed to store the rest of input
109
110
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
111
55.2k
    for (int i = 0; i < tail_count; i++) {
112
43.0k
        s |= (static_cast<int64_t>(input[i + full_batch_size]))
113
43.0k
             << ((tail_count - i - 1) * bit_width);
114
43.0k
    }
115
116
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
117
12.2k
    s <<= (bytes << 3) - byte;
118
119
    // Starting with the highest valid bit, take out 8 bits in sequence
120
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
121
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
122
40.6k
    for (int i = 0; i < bytes; i++) {
123
28.4k
        output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask;
124
28.4k
    }
125
12.2k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIhE10bit_pack_8EPKhhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderItE10bit_pack_8EPKthiPh
_ZN5doris10ForEncoderIjE10bit_pack_8EPKjhiPh
Line
Count
Source
79
6
void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
80
6
    int64_t s = 0;
81
6
    uint8_t output_mask = 255;
82
6
    int tail_count = in_num & 7;              // the remainder of in_num modulo 8
83
6
    int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8
84
85
102
    for (int i = 0; i < full_batch_size; i += 8) {
86
        // Put the 8 numbers in the input into s in order, each number occupies bit_width bit
87
96
        s |= static_cast<int64_t>(input[i + 7]);
88
96
        s |= (static_cast<int64_t>(input[i + 6])) << bit_width;
89
96
        s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width);
90
96
        s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width);
91
96
        s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width);
92
96
        s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width);
93
96
        s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width);
94
96
        s |= (static_cast<int64_t>(input[i])) << (7 * bit_width);
95
96
        // Starting with the highest valid bit, take out 8 bits in sequence
97
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
98
        // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end
99
192
        for (int j = 0; j < bit_width; j++) {
100
96
            output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask;
101
96
        }
102
96
        output += bit_width;
103
96
        s = 0;
104
96
    }
105
106
    // remainder
107
6
    int byte = tail_count * bit_width; // How many bits are left to store
108
6
    int bytes = (byte + 7) >> 3;       // How many more bytes are needed to store the rest of input
109
110
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
111
6
    for (int i = 0; i < tail_count; i++) {
112
0
        s |= (static_cast<int64_t>(input[i + full_batch_size]))
113
0
             << ((tail_count - i - 1) * bit_width);
114
0
    }
115
116
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
117
6
    s <<= (bytes << 3) - byte;
118
119
    // Starting with the highest valid bit, take out 8 bits in sequence
120
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
121
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
122
6
    for (int i = 0; i < bytes; i++) {
123
0
        output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask;
124
0
    }
125
6
}
Unexecuted instantiation: _ZN5doris10ForEncoderImE10bit_pack_8EPKmhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10bit_pack_8EPKS1_hiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIoE10bit_pack_8EPKohiPh
126
127
template <typename T>
128
template <typename U>
129
45.8k
void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
130
45.8k
    U s = 0;
131
45.8k
    uint8_t output_mask = 255;
132
45.8k
    int tail_count = in_num & 3;              // the remainder of in_num modulo 4
133
45.8k
    int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4
134
45.8k
    int output_size = 0;                      // How many outputs can be processed at a time
135
45.8k
    int bit_width_remainder =
136
45.8k
            (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time
137
45.8k
    int extra_bit = 0;            // Extra bits after each process
138
139
1.40M
    for (int i = 0; i < full_batch_size; i += 4) {
140
        // Put the 4 numbers in the input into s in order, each number occupies bit_width bit
141
        // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop
142
1.35M
        s <<= bit_width;
143
1.35M
        s |= (static_cast<U>(input[i]));
144
1.35M
        s <<= bit_width;
145
1.35M
        s |= (static_cast<U>(input[i + 1]));
146
1.35M
        s <<= bit_width;
147
1.35M
        s |= (static_cast<U>(input[i + 2]));
148
1.35M
        s <<= bit_width;
149
1.35M
        s |= (static_cast<U>(input[i + 3]));
150
151
        // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s,
152
        // and there are extra_bit bits left over from the last loop,
153
        // divide by 8 to calculate how much output can be processed in this loop.
154
1.35M
        output_size = ((bit_width << 2) + extra_bit) >> 3;
155
156
        // Each loop will leave bit_width_remainder bit unprocessed,
157
        // last loop will leave extra_bit bit, eventually will leave
158
        // (extra_bit + bit_width_remainder) & 7 bit unprocessed
159
1.35M
        extra_bit = (extra_bit + bit_width_remainder) & 7;
160
161
        // Starting with the highest valid bit, take out 8 bits in sequence
162
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
163
        // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end
164
        // But since there are still extra_bit bits that can't be processed, need to add the extra_bit
165
15.2M
        for (int j = 0; j < output_size; j++) {
166
13.8M
            output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask;
167
13.8M
        }
168
1.35M
        output += output_size;
169
170
        // s retains the post extra_bit bit as it is not processed
171
1.35M
        s &= (1 << extra_bit) - 1;
172
1.35M
    }
173
174
    // remainder
175
45.8k
    int byte = tail_count * bit_width;     // How many bits are left to store
176
45.8k
    if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed
177
45.8k
    int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input
178
179
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
180
110k
    for (int i = 0; i < tail_count; i++) {
181
64.4k
        s <<= bit_width;
182
64.4k
        s |= (input[i + full_batch_size]);
183
64.4k
    }
184
185
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
186
45.8k
    s <<= (bytes << 3) - byte;
187
188
    // Starting with the highest valid bit, take out 8 bits in sequence
189
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
190
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
191
231k
    for (int i = 0; i < bytes; i++) {
192
185k
        output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask;
193
185k
    }
194
45.8k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIaE10bit_pack_4IlEEvPKahiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIaE10bit_pack_4InEEvPKahiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIsE10bit_pack_4IlEEvPKshiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIsE10bit_pack_4InEEvPKshiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIiE10bit_pack_4IlEEvPKihiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIiE10bit_pack_4InEEvPKihiPh
_ZN5doris10ForEncoderIlE10bit_pack_4IlEEvPKlhiPh
Line
Count
Source
129
3.03k
void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
130
3.03k
    U s = 0;
131
3.03k
    uint8_t output_mask = 255;
132
3.03k
    int tail_count = in_num & 3;              // the remainder of in_num modulo 4
133
3.03k
    int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4
134
3.03k
    int output_size = 0;                      // How many outputs can be processed at a time
135
3.03k
    int bit_width_remainder =
136
3.03k
            (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time
137
3.03k
    int extra_bit = 0;            // Extra bits after each process
138
139
67.5k
    for (int i = 0; i < full_batch_size; i += 4) {
140
        // Put the 4 numbers in the input into s in order, each number occupies bit_width bit
141
        // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop
142
64.5k
        s <<= bit_width;
143
64.5k
        s |= (static_cast<U>(input[i]));
144
64.5k
        s <<= bit_width;
145
64.5k
        s |= (static_cast<U>(input[i + 1]));
146
64.5k
        s <<= bit_width;
147
64.5k
        s |= (static_cast<U>(input[i + 2]));
148
64.5k
        s <<= bit_width;
149
64.5k
        s |= (static_cast<U>(input[i + 3]));
150
151
        // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s,
152
        // and there are extra_bit bits left over from the last loop,
153
        // divide by 8 to calculate how much output can be processed in this loop.
154
64.5k
        output_size = ((bit_width << 2) + extra_bit) >> 3;
155
156
        // Each loop will leave bit_width_remainder bit unprocessed,
157
        // last loop will leave extra_bit bit, eventually will leave
158
        // (extra_bit + bit_width_remainder) & 7 bit unprocessed
159
64.5k
        extra_bit = (extra_bit + bit_width_remainder) & 7;
160
161
        // Starting with the highest valid bit, take out 8 bits in sequence
162
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
163
        // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end
164
        // But since there are still extra_bit bits that can't be processed, need to add the extra_bit
165
467k
        for (int j = 0; j < output_size; j++) {
166
402k
            output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask;
167
402k
        }
168
64.5k
        output += output_size;
169
170
        // s retains the post extra_bit bit as it is not processed
171
64.5k
        s &= (1 << extra_bit) - 1;
172
64.5k
    }
173
174
    // remainder
175
3.03k
    int byte = tail_count * bit_width;     // How many bits are left to store
176
3.03k
    if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed
177
3.03k
    int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input
178
179
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
180
6.08k
    for (int i = 0; i < tail_count; i++) {
181
3.04k
        s <<= bit_width;
182
3.04k
        s |= (input[i + full_batch_size]);
183
3.04k
    }
184
185
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
186
3.03k
    s <<= (bytes << 3) - byte;
187
188
    // Starting with the highest valid bit, take out 8 bits in sequence
189
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
190
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
191
8.75k
    for (int i = 0; i < bytes; i++) {
192
5.71k
        output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask;
193
5.71k
    }
194
3.03k
}
_ZN5doris10ForEncoderIlE10bit_pack_4InEEvPKlhiPh
Line
Count
Source
129
6.08k
void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
130
6.08k
    U s = 0;
131
6.08k
    uint8_t output_mask = 255;
132
6.08k
    int tail_count = in_num & 3;              // the remainder of in_num modulo 4
133
6.08k
    int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4
134
6.08k
    int output_size = 0;                      // How many outputs can be processed at a time
135
6.08k
    int bit_width_remainder =
136
6.08k
            (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time
137
6.08k
    int extra_bit = 0;            // Extra bits after each process
138
139
135k
    for (int i = 0; i < full_batch_size; i += 4) {
140
        // Put the 4 numbers in the input into s in order, each number occupies bit_width bit
141
        // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop
142
129k
        s <<= bit_width;
143
129k
        s |= (static_cast<U>(input[i]));
144
129k
        s <<= bit_width;
145
129k
        s |= (static_cast<U>(input[i + 1]));
146
129k
        s <<= bit_width;
147
129k
        s |= (static_cast<U>(input[i + 2]));
148
129k
        s <<= bit_width;
149
129k
        s |= (static_cast<U>(input[i + 3]));
150
151
        // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s,
152
        // and there are extra_bit bits left over from the last loop,
153
        // divide by 8 to calculate how much output can be processed in this loop.
154
129k
        output_size = ((bit_width << 2) + extra_bit) >> 3;
155
156
        // Each loop will leave bit_width_remainder bit unprocessed,
157
        // last loop will leave extra_bit bit, eventually will leave
158
        // (extra_bit + bit_width_remainder) & 7 bit unprocessed
159
129k
        extra_bit = (extra_bit + bit_width_remainder) & 7;
160
161
        // Starting with the highest valid bit, take out 8 bits in sequence
162
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
163
        // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end
164
        // But since there are still extra_bit bits that can't be processed, need to add the extra_bit
165
1.70M
        for (int j = 0; j < output_size; j++) {
166
1.58M
            output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask;
167
1.58M
        }
168
129k
        output += output_size;
169
170
        // s retains the post extra_bit bit as it is not processed
171
129k
        s &= (1 << extra_bit) - 1;
172
129k
    }
173
174
    // remainder
175
6.08k
    int byte = tail_count * bit_width;     // How many bits are left to store
176
6.08k
    if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed
177
6.08k
    int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input
178
179
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
180
12.2k
    for (int i = 0; i < tail_count; i++) {
181
6.12k
        s <<= bit_width;
182
6.12k
        s |= (input[i + full_batch_size]);
183
6.12k
    }
184
185
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
186
6.08k
    s <<= (bytes << 3) - byte;
187
188
    // Starting with the highest valid bit, take out 8 bits in sequence
189
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
190
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
191
26.7k
    for (int i = 0; i < bytes; i++) {
192
20.6k
        output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask;
193
20.6k
    }
194
6.08k
}
_ZN5doris10ForEncoderInE10bit_pack_4IlEEvPKnhiPh
Line
Count
Source
129
12.2k
void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
130
12.2k
    U s = 0;
131
12.2k
    uint8_t output_mask = 255;
132
12.2k
    int tail_count = in_num & 3;              // the remainder of in_num modulo 4
133
12.2k
    int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4
134
12.2k
    int output_size = 0;                      // How many outputs can be processed at a time
135
12.2k
    int bit_width_remainder =
136
12.2k
            (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time
137
12.2k
    int extra_bit = 0;            // Extra bits after each process
138
139
399k
    for (int i = 0; i < full_batch_size; i += 4) {
140
        // Put the 4 numbers in the input into s in order, each number occupies bit_width bit
141
        // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop
142
387k
        s <<= bit_width;
143
387k
        s |= (static_cast<U>(input[i]));
144
387k
        s <<= bit_width;
145
387k
        s |= (static_cast<U>(input[i + 1]));
146
387k
        s <<= bit_width;
147
387k
        s |= (static_cast<U>(input[i + 2]));
148
387k
        s <<= bit_width;
149
387k
        s |= (static_cast<U>(input[i + 3]));
150
151
        // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s,
152
        // and there are extra_bit bits left over from the last loop,
153
        // divide by 8 to calculate how much output can be processed in this loop.
154
387k
        output_size = ((bit_width << 2) + extra_bit) >> 3;
155
156
        // Each loop will leave bit_width_remainder bit unprocessed,
157
        // last loop will leave extra_bit bit, eventually will leave
158
        // (extra_bit + bit_width_remainder) & 7 bit unprocessed
159
387k
        extra_bit = (extra_bit + bit_width_remainder) & 7;
160
161
        // Starting with the highest valid bit, take out 8 bits in sequence
162
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
163
        // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end
164
        // But since there are still extra_bit bits that can't be processed, need to add the extra_bit
165
2.80M
        for (int j = 0; j < output_size; j++) {
166
2.41M
            output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask;
167
2.41M
        }
168
387k
        output += output_size;
169
170
        // s retains the post extra_bit bit as it is not processed
171
387k
        s &= (1 << extra_bit) - 1;
172
387k
    }
173
174
    // remainder
175
12.2k
    int byte = tail_count * bit_width;     // How many bits are left to store
176
12.2k
    if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed
177
12.2k
    int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input
178
179
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
180
30.6k
    for (int i = 0; i < tail_count; i++) {
181
18.4k
        s <<= bit_width;
182
18.4k
        s |= (input[i + full_batch_size]);
183
18.4k
    }
184
185
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
186
12.2k
    s <<= (bytes << 3) - byte;
187
188
    // Starting with the highest valid bit, take out 8 bits in sequence
189
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
190
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
191
46.8k
    for (int i = 0; i < bytes; i++) {
192
34.5k
        output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask;
193
34.5k
    }
194
12.2k
}
_ZN5doris10ForEncoderInE10bit_pack_4InEEvPKnhiPh
Line
Count
Source
129
24.4k
void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
130
24.4k
    U s = 0;
131
24.4k
    uint8_t output_mask = 255;
132
24.4k
    int tail_count = in_num & 3;              // the remainder of in_num modulo 4
133
24.4k
    int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4
134
24.4k
    int output_size = 0;                      // How many outputs can be processed at a time
135
24.4k
    int bit_width_remainder =
136
24.4k
            (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time
137
24.4k
    int extra_bit = 0;            // Extra bits after each process
138
139
798k
    for (int i = 0; i < full_batch_size; i += 4) {
140
        // Put the 4 numbers in the input into s in order, each number occupies bit_width bit
141
        // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop
142
774k
        s <<= bit_width;
143
774k
        s |= (static_cast<U>(input[i]));
144
774k
        s <<= bit_width;
145
774k
        s |= (static_cast<U>(input[i + 1]));
146
774k
        s <<= bit_width;
147
774k
        s |= (static_cast<U>(input[i + 2]));
148
774k
        s <<= bit_width;
149
774k
        s |= (static_cast<U>(input[i + 3]));
150
151
        // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s,
152
        // and there are extra_bit bits left over from the last loop,
153
        // divide by 8 to calculate how much output can be processed in this loop.
154
774k
        output_size = ((bit_width << 2) + extra_bit) >> 3;
155
156
        // Each loop will leave bit_width_remainder bit unprocessed,
157
        // last loop will leave extra_bit bit, eventually will leave
158
        // (extra_bit + bit_width_remainder) & 7 bit unprocessed
159
774k
        extra_bit = (extra_bit + bit_width_remainder) & 7;
160
161
        // Starting with the highest valid bit, take out 8 bits in sequence
162
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
163
        // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end
164
        // But since there are still extra_bit bits that can't be processed, need to add the extra_bit
165
10.2M
        for (int j = 0; j < output_size; j++) {
166
9.48M
            output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask;
167
9.48M
        }
168
774k
        output += output_size;
169
170
        // s retains the post extra_bit bit as it is not processed
171
774k
        s &= (1 << extra_bit) - 1;
172
774k
    }
173
174
    // remainder
175
24.4k
    int byte = tail_count * bit_width;     // How many bits are left to store
176
24.4k
    if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed
177
24.4k
    int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input
178
179
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
180
61.3k
    for (int i = 0; i < tail_count; i++) {
181
36.8k
        s <<= bit_width;
182
36.8k
        s |= (input[i + full_batch_size]);
183
36.8k
    }
184
185
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
186
24.4k
    s <<= (bytes << 3) - byte;
187
188
    // Starting with the highest valid bit, take out 8 bits in sequence
189
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
190
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
191
148k
    for (int i = 0; i < bytes; i++) {
192
124k
        output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask;
193
124k
    }
194
24.4k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIhE10bit_pack_4IlEEvPKhhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIhE10bit_pack_4InEEvPKhhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderItE10bit_pack_4IlEEvPKthiPh
Unexecuted instantiation: _ZN5doris10ForEncoderItE10bit_pack_4InEEvPKthiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIjE10bit_pack_4IlEEvPKjhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIjE10bit_pack_4InEEvPKjhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderImE10bit_pack_4IlEEvPKmhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderImE10bit_pack_4InEEvPKmhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10bit_pack_4IlEEvPKS1_hiPh
Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10bit_pack_4InEEvPKS1_hiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIoE10bit_pack_4IlEEvPKohiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIoE10bit_pack_4InEEvPKohiPh
195
196
template <typename T>
197
181k
void ForEncoder<T>::bit_pack_1(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
198
181k
    int output_mask = 255;
199
181k
    int need_bit = 0; // still need
200
201
21.9M
    for (int i = 0; i < in_num; i++) {
202
21.7M
        T x = input[i];
203
21.7M
        int width = bit_width;
204
21.7M
        if (need_bit) {
205
            // The last time we take away the high 8 - need_bit,
206
            // we need to make up the rest of the need_bit from the width.
207
            // Use width - need_bit to compute high need_bit bits
208
15.0M
            *output |= x >> (width - need_bit);
209
15.0M
            output++;
210
            // There are need_bit bits being used, so subtract
211
15.0M
            width -= need_bit;
212
15.0M
        }
213
21.7M
        int num = width >> 3;      // How many outputs can be processed at a time
214
21.7M
        int remainder = width & 7; // How many bits are left to store
215
216
        // Starting with the highest valid bit, take out 8 bits in sequence
217
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
218
        // (num-j-1)<<3 used to calculate how many bits need to be removed at the end
219
        // But since there are still remainder bits that can't be processed, need to add the remainder
220
223M
        for (int j = 0; j < num; j++) {
221
202M
            *output = cast_set<uint8_t>((x >> (((num - j - 1) << 3) + remainder)) & output_mask);
222
202M
            output++;
223
202M
        }
224
21.7M
        if (remainder) {
225
            // Process the last remaining remainder bit.
226
            // y = (x & ((1 << remainder) - 1)) extract the last remainder bits.
227
            // ouput = y << (8 - reaminder)  Use the high 8 - remainder bit
228
15.1M
            *output = cast_set<uint8_t>((x & ((1 << remainder) - 1)) << (8 - remainder));
229
            // Already have remainder bits, next time need 8-remainder bits
230
15.1M
            need_bit = 8 - remainder;
231
15.1M
        } else {
232
6.57M
            need_bit = 0;
233
6.57M
        }
234
21.7M
    }
235
181k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIaE10bit_pack_1EPKahiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIsE10bit_pack_1EPKshiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIiE10bit_pack_1EPKihiPh
_ZN5doris10ForEncoderIlE10bit_pack_1EPKlhiPh
Line
Count
Source
197
12.1k
void ForEncoder<T>::bit_pack_1(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
198
12.1k
    int output_mask = 255;
199
12.1k
    int need_bit = 0; // still need
200
201
1.05M
    for (int i = 0; i < in_num; i++) {
202
1.04M
        T x = input[i];
203
1.04M
        int width = bit_width;
204
1.04M
        if (need_bit) {
205
            // The last time we take away the high 8 - need_bit,
206
            // we need to make up the rest of the need_bit from the width.
207
            // Use width - need_bit to compute high need_bit bits
208
743k
            *output |= x >> (width - need_bit);
209
743k
            output++;
210
            // There are need_bit bits being used, so subtract
211
743k
            width -= need_bit;
212
743k
        }
213
1.04M
        int num = width >> 3;      // How many outputs can be processed at a time
214
1.04M
        int remainder = width & 7; // How many bits are left to store
215
216
        // Starting with the highest valid bit, take out 8 bits in sequence
217
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
218
        // (num-j-1)<<3 used to calculate how many bits need to be removed at the end
219
        // But since there are still remainder bits that can't be processed, need to add the remainder
220
6.62M
        for (int j = 0; j < num; j++) {
221
5.58M
            *output = cast_set<uint8_t>((x >> (((num - j - 1) << 3) + remainder)) & output_mask);
222
5.58M
            output++;
223
5.58M
        }
224
1.04M
        if (remainder) {
225
            // Process the last remaining remainder bit.
226
            // y = (x & ((1 << remainder) - 1)) extract the last remainder bits.
227
            // ouput = y << (8 - reaminder)  Use the high 8 - remainder bit
228
749k
            *output = cast_set<uint8_t>((x & ((1 << remainder) - 1)) << (8 - remainder));
229
            // Already have remainder bits, next time need 8-remainder bits
230
749k
            need_bit = 8 - remainder;
231
749k
        } else {
232
294k
            need_bit = 0;
233
294k
        }
234
1.04M
    }
235
12.1k
}
_ZN5doris10ForEncoderInE10bit_pack_1EPKnhiPh
Line
Count
Source
197
169k
void ForEncoder<T>::bit_pack_1(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
198
169k
    int output_mask = 255;
199
169k
    int need_bit = 0; // still need
200
201
20.8M
    for (int i = 0; i < in_num; i++) {
202
20.6M
        T x = input[i];
203
20.6M
        int width = bit_width;
204
20.6M
        if (need_bit) {
205
            // The last time we take away the high 8 - need_bit,
206
            // we need to make up the rest of the need_bit from the width.
207
            // Use width - need_bit to compute high need_bit bits
208
14.3M
            *output |= x >> (width - need_bit);
209
14.3M
            output++;
210
            // There are need_bit bits being used, so subtract
211
14.3M
            width -= need_bit;
212
14.3M
        }
213
20.6M
        int num = width >> 3;      // How many outputs can be processed at a time
214
20.6M
        int remainder = width & 7; // How many bits are left to store
215
216
        // Starting with the highest valid bit, take out 8 bits in sequence
217
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
218
        // (num-j-1)<<3 used to calculate how many bits need to be removed at the end
219
        // But since there are still remainder bits that can't be processed, need to add the remainder
220
217M
        for (int j = 0; j < num; j++) {
221
196M
            *output = cast_set<uint8_t>((x >> (((num - j - 1) << 3) + remainder)) & output_mask);
222
196M
            output++;
223
196M
        }
224
20.6M
        if (remainder) {
225
            // Process the last remaining remainder bit.
226
            // y = (x & ((1 << remainder) - 1)) extract the last remainder bits.
227
            // ouput = y << (8 - reaminder)  Use the high 8 - remainder bit
228
14.4M
            *output = cast_set<uint8_t>((x & ((1 << remainder) - 1)) << (8 - remainder));
229
            // Already have remainder bits, next time need 8-remainder bits
230
14.4M
            need_bit = 8 - remainder;
231
14.4M
        } else {
232
6.27M
            need_bit = 0;
233
6.27M
        }
234
20.6M
    }
235
169k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIhE10bit_pack_1EPKhhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderItE10bit_pack_1EPKthiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIjE10bit_pack_1EPKjhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderImE10bit_pack_1EPKmhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10bit_pack_1EPKS1_hiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIoE10bit_pack_1EPKohiPh
236
237
// Use as few bit as possible to store a piece of integer data.
238
// param[in] input: the integer list need to pack
239
// param[in] in_num: the number integer need to pack
240
// param[in] bit_width: how many bit we use to store each integer data
241
// param[out] out: the packed result
242
243
// For example:
244
// The input is int32 list: 1, 2, 4, 8 and bit_width is 4
245
// The output will be: 0001 0010 0100 1000
246
template <typename T>
247
243k
void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
248
243k
    if (in_num == 0 || bit_width == 0) {
249
260
        return;
250
260
    }
251
    /*
252
        bit_width <= 8 : pack_8 > pack_16 > pack_32
253
        bit_width <= 16 : pack_4 > pack_8 > pack_16
254
        bit_width <= 32 : pack_4 >= pack_2 > pack_8 
255
        (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently)
256
        bit_width <= 64 : pack_1 > pack_4
257
    */
258
242k
    if (bit_width <= 8) {
259
15.3k
        bit_pack_8(input, in_num, bit_width, output);
260
227k
    } else if (bit_width <= 16) {
261
15.2k
        bit_pack_4<int64_t>(input, in_num, bit_width, output);
262
212k
    } else if (bit_width <= 32) {
263
30.5k
        bit_pack_4<__int128_t>(input, in_num, bit_width, output);
264
181k
    } else {
265
181k
        bit_pack_1(input, in_num, bit_width, output);
266
181k
    }
267
242k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIaE8bit_packEPKahiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIsE8bit_packEPKshiPh
_ZN5doris10ForEncoderIiE8bit_packEPKihiPh
Line
Count
Source
247
9
void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
248
9
    if (in_num == 0 || bit_width == 0) {
249
1
        return;
250
1
    }
251
    /*
252
        bit_width <= 8 : pack_8 > pack_16 > pack_32
253
        bit_width <= 16 : pack_4 > pack_8 > pack_16
254
        bit_width <= 32 : pack_4 >= pack_2 > pack_8 
255
        (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently)
256
        bit_width <= 64 : pack_1 > pack_4
257
    */
258
8
    if (bit_width <= 8) {
259
8
        bit_pack_8(input, in_num, bit_width, output);
260
8
    } else if (bit_width <= 16) {
261
0
        bit_pack_4<int64_t>(input, in_num, bit_width, output);
262
0
    } else if (bit_width <= 32) {
263
0
        bit_pack_4<__int128_t>(input, in_num, bit_width, output);
264
0
    } else {
265
0
        bit_pack_1(input, in_num, bit_width, output);
266
0
    }
267
8
}
_ZN5doris10ForEncoderIlE8bit_packEPKlhiPh
Line
Count
Source
247
24.4k
void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
248
24.4k
    if (in_num == 0 || bit_width == 0) {
249
131
        return;
250
131
    }
251
    /*
252
        bit_width <= 8 : pack_8 > pack_16 > pack_32
253
        bit_width <= 16 : pack_4 > pack_8 > pack_16
254
        bit_width <= 32 : pack_4 >= pack_2 > pack_8 
255
        (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently)
256
        bit_width <= 64 : pack_1 > pack_4
257
    */
258
24.3k
    if (bit_width <= 8) {
259
3.05k
        bit_pack_8(input, in_num, bit_width, output);
260
21.2k
    } else if (bit_width <= 16) {
261
3.03k
        bit_pack_4<int64_t>(input, in_num, bit_width, output);
262
18.2k
    } else if (bit_width <= 32) {
263
6.08k
        bit_pack_4<__int128_t>(input, in_num, bit_width, output);
264
12.1k
    } else {
265
12.1k
        bit_pack_1(input, in_num, bit_width, output);
266
12.1k
    }
267
24.3k
}
_ZN5doris10ForEncoderInE8bit_packEPKnhiPh
Line
Count
Source
247
218k
void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
248
218k
    if (in_num == 0 || bit_width == 0) {
249
128
        return;
250
128
    }
251
    /*
252
        bit_width <= 8 : pack_8 > pack_16 > pack_32
253
        bit_width <= 16 : pack_4 > pack_8 > pack_16
254
        bit_width <= 32 : pack_4 >= pack_2 > pack_8 
255
        (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently)
256
        bit_width <= 64 : pack_1 > pack_4
257
    */
258
218k
    if (bit_width <= 8) {
259
12.2k
        bit_pack_8(input, in_num, bit_width, output);
260
206k
    } else if (bit_width <= 16) {
261
12.2k
        bit_pack_4<int64_t>(input, in_num, bit_width, output);
262
194k
    } else if (bit_width <= 32) {
263
24.4k
        bit_pack_4<__int128_t>(input, in_num, bit_width, output);
264
169k
    } else {
265
169k
        bit_pack_1(input, in_num, bit_width, output);
266
169k
    }
267
218k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIhE8bit_packEPKhhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderItE8bit_packEPKthiPh
_ZN5doris10ForEncoderIjE8bit_packEPKjhiPh
Line
Count
Source
247
6
void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
248
6
    if (in_num == 0 || bit_width == 0) {
249
0
        return;
250
0
    }
251
    /*
252
        bit_width <= 8 : pack_8 > pack_16 > pack_32
253
        bit_width <= 16 : pack_4 > pack_8 > pack_16
254
        bit_width <= 32 : pack_4 >= pack_2 > pack_8 
255
        (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently)
256
        bit_width <= 64 : pack_1 > pack_4
257
    */
258
6
    if (bit_width <= 8) {
259
6
        bit_pack_8(input, in_num, bit_width, output);
260
6
    } else if (bit_width <= 16) {
261
0
        bit_pack_4<int64_t>(input, in_num, bit_width, output);
262
0
    } else if (bit_width <= 32) {
263
0
        bit_pack_4<__int128_t>(input, in_num, bit_width, output);
264
0
    } else {
265
0
        bit_pack_1(input, in_num, bit_width, output);
266
0
    }
267
6
}
Unexecuted instantiation: _ZN5doris10ForEncoderImE8bit_packEPKmhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE8bit_packEPKS1_hiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIoE8bit_packEPKohiPh
268
269
template <typename T>
270
48.9k
void ForEncoder<T>::bit_packing_one_frame_value(const T* input) {
271
48.9k
    T min = input[0];
272
48.9k
    T max = input[0];
273
48.9k
    bool is_ascending = true;
274
48.9k
    uint8_t bit_width = 0;
275
48.9k
    T half_max_delta = numeric_limits_max() >> 1;
276
48.9k
    bool is_keep_original_value = false;
277
278
    // 1. make sure order_flag, save_original_value, and find max&min.
279
4.18M
    for (uint8_t i = 1; i < _buffered_values_num; ++i) {
280
4.13M
        if (is_ascending) {
281
86.4k
            if (input[i] < input[i - 1]) {
282
48.4k
                is_ascending = false;
283
48.4k
            } else {
284
38.0k
                if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow
285
0
                    is_keep_original_value = true;
286
38.0k
                } else {
287
38.0k
                    bit_width = std::max(bit_width, bits(input[i] - input[i - 1]));
288
38.0k
                }
289
38.0k
            }
290
86.4k
        }
291
292
4.13M
        if (input[i] < min) {
293
180k
            min = input[i];
294
180k
            continue;
295
180k
        }
296
297
3.95M
        if (input[i] > max) {
298
183k
            max = input[i];
299
183k
        }
300
3.95M
    }
301
48.9k
    if (!is_ascending) {
302
48.4k
        if ((max >> 1) - (min >> 1) > half_max_delta) {
303
0
            is_keep_original_value = true;
304
0
        }
305
48.4k
    }
306
307
    // 2. save min value.
308
48.9k
    if (sizeof(T) == 16) {
309
24.4k
        put_fixed128_le(_buffer, static_cast<uint128_t>(min));
310
24.4k
    } else if (sizeof(T) == 8) {
311
24.4k
        put_fixed64_le(_buffer, static_cast<uint64_t>(min));
312
24.4k
    } else {
313
15
        put_fixed32_le(_buffer, static_cast<uint32_t>(min));
314
15
    }
315
316
    // 3.1 save original value.
317
48.9k
    if (is_keep_original_value) {
318
0
        bit_width = sizeof(T) * 8;
319
0
        uint32_t len = _buffered_values_num * bit_width;
320
0
        _buffer->reserve(_buffer->size() + len);
321
0
        size_t origin_size = _buffer->size();
322
0
        _buffer->resize(origin_size + len);
323
0
        bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size);
324
48.9k
    } else {
325
        // 3.2 bit pack.
326
        // improve for ascending order input, we could use fewer bit
327
48.9k
        T delta_values[FRAME_VALUE_NUM];
328
48.9k
        if (is_ascending) {
329
449
            delta_values[0] = 0;
330
3.17k
            for (uint8_t i = 1; i < _buffered_values_num; ++i) {
331
2.72k
                delta_values[i] = input[i] - input[i - 1];
332
2.72k
            }
333
48.4k
        } else {
334
48.4k
            bit_width = bits(static_cast<T>(max - min));
335
4.22M
            for (uint8_t i = 0; i < _buffered_values_num; ++i) {
336
4.17M
                delta_values[i] = input[i] - min;
337
4.17M
            }
338
48.4k
        }
339
340
48.9k
        uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8);
341
342
48.9k
        _buffer->reserve(_buffer->size() + packing_len);
343
48.9k
        size_t origin_size = _buffer->size();
344
48.9k
        _buffer->resize(origin_size + packing_len);
345
48.9k
        bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size);
346
48.9k
    }
347
48.9k
    uint8_t storage_format = 0;
348
48.9k
    if (is_keep_original_value) {
349
0
        storage_format = 2;
350
48.9k
    } else if (is_ascending) {
351
449
        storage_format = 1;
352
449
    }
353
48.9k
    _storage_formats.push_back(storage_format);
354
48.9k
    _bit_widths.push_back(bit_width);
355
356
48.9k
    _buffered_values_num = 0;
357
48.9k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIaE27bit_packing_one_frame_valueEPKa
Unexecuted instantiation: _ZN5doris10ForEncoderIsE27bit_packing_one_frame_valueEPKs
_ZN5doris10ForEncoderIiE27bit_packing_one_frame_valueEPKi
Line
Count
Source
270
9
void ForEncoder<T>::bit_packing_one_frame_value(const T* input) {
271
9
    T min = input[0];
272
9
    T max = input[0];
273
9
    bool is_ascending = true;
274
9
    uint8_t bit_width = 0;
275
9
    T half_max_delta = numeric_limits_max() >> 1;
276
9
    bool is_keep_original_value = false;
277
278
    // 1. make sure order_flag, save_original_value, and find max&min.
279
771
    for (uint8_t i = 1; i < _buffered_values_num; ++i) {
280
762
        if (is_ascending) {
281
762
            if (input[i] < input[i - 1]) {
282
0
                is_ascending = false;
283
762
            } else {
284
762
                if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow
285
0
                    is_keep_original_value = true;
286
762
                } else {
287
762
                    bit_width = std::max(bit_width, bits(input[i] - input[i - 1]));
288
762
                }
289
762
            }
290
762
        }
291
292
762
        if (input[i] < min) {
293
0
            min = input[i];
294
0
            continue;
295
0
        }
296
297
762
        if (input[i] > max) {
298
762
            max = input[i];
299
762
        }
300
762
    }
301
9
    if (!is_ascending) {
302
0
        if ((max >> 1) - (min >> 1) > half_max_delta) {
303
0
            is_keep_original_value = true;
304
0
        }
305
0
    }
306
307
    // 2. save min value.
308
9
    if (sizeof(T) == 16) {
309
0
        put_fixed128_le(_buffer, static_cast<uint128_t>(min));
310
9
    } else if (sizeof(T) == 8) {
311
0
        put_fixed64_le(_buffer, static_cast<uint64_t>(min));
312
9
    } else {
313
9
        put_fixed32_le(_buffer, static_cast<uint32_t>(min));
314
9
    }
315
316
    // 3.1 save original value.
317
9
    if (is_keep_original_value) {
318
0
        bit_width = sizeof(T) * 8;
319
0
        uint32_t len = _buffered_values_num * bit_width;
320
0
        _buffer->reserve(_buffer->size() + len);
321
0
        size_t origin_size = _buffer->size();
322
0
        _buffer->resize(origin_size + len);
323
0
        bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size);
324
9
    } else {
325
        // 3.2 bit pack.
326
        // improve for ascending order input, we could use fewer bit
327
9
        T delta_values[FRAME_VALUE_NUM];
328
9
        if (is_ascending) {
329
9
            delta_values[0] = 0;
330
771
            for (uint8_t i = 1; i < _buffered_values_num; ++i) {
331
762
                delta_values[i] = input[i] - input[i - 1];
332
762
            }
333
9
        } else {
334
0
            bit_width = bits(static_cast<T>(max - min));
335
0
            for (uint8_t i = 0; i < _buffered_values_num; ++i) {
336
0
                delta_values[i] = input[i] - min;
337
0
            }
338
0
        }
339
340
9
        uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8);
341
342
9
        _buffer->reserve(_buffer->size() + packing_len);
343
9
        size_t origin_size = _buffer->size();
344
9
        _buffer->resize(origin_size + packing_len);
345
9
        bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size);
346
9
    }
347
9
    uint8_t storage_format = 0;
348
9
    if (is_keep_original_value) {
349
0
        storage_format = 2;
350
9
    } else if (is_ascending) {
351
9
        storage_format = 1;
352
9
    }
353
9
    _storage_formats.push_back(storage_format);
354
9
    _bit_widths.push_back(bit_width);
355
356
9
    _buffered_values_num = 0;
357
9
}
_ZN5doris10ForEncoderIlE27bit_packing_one_frame_valueEPKl
Line
Count
Source
270
24.4k
void ForEncoder<T>::bit_packing_one_frame_value(const T* input) {
271
24.4k
    T min = input[0];
272
24.4k
    T max = input[0];
273
24.4k
    bool is_ascending = true;
274
24.4k
    uint8_t bit_width = 0;
275
24.4k
    T half_max_delta = numeric_limits_max() >> 1;
276
24.4k
    bool is_keep_original_value = false;
277
278
    // 1. make sure order_flag, save_original_value, and find max&min.
279
2.08M
    for (uint8_t i = 1; i < _buffered_values_num; ++i) {
280
2.06M
        if (is_ascending) {
281
43.3k
            if (input[i] < input[i - 1]) {
282
24.2k
                is_ascending = false;
283
24.2k
            } else {
284
19.1k
                if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow
285
0
                    is_keep_original_value = true;
286
19.1k
                } else {
287
19.1k
                    bit_width = std::max(bit_width, bits(input[i] - input[i - 1]));
288
19.1k
                }
289
19.1k
            }
290
43.3k
        }
291
292
2.06M
        if (input[i] < min) {
293
88.0k
            min = input[i];
294
88.0k
            continue;
295
88.0k
        }
296
297
1.97M
        if (input[i] > max) {
298
89.5k
            max = input[i];
299
89.5k
        }
300
1.97M
    }
301
24.4k
    if (!is_ascending) {
302
24.2k
        if ((max >> 1) - (min >> 1) > half_max_delta) {
303
0
            is_keep_original_value = true;
304
0
        }
305
24.2k
    }
306
307
    // 2. save min value.
308
24.4k
    if (sizeof(T) == 16) {
309
0
        put_fixed128_le(_buffer, static_cast<uint128_t>(min));
310
24.4k
    } else if (sizeof(T) == 8) {
311
24.4k
        put_fixed64_le(_buffer, static_cast<uint64_t>(min));
312
24.4k
    } else {
313
0
        put_fixed32_le(_buffer, static_cast<uint32_t>(min));
314
0
    }
315
316
    // 3.1 save original value.
317
24.4k
    if (is_keep_original_value) {
318
0
        bit_width = sizeof(T) * 8;
319
0
        uint32_t len = _buffered_values_num * bit_width;
320
0
        _buffer->reserve(_buffer->size() + len);
321
0
        size_t origin_size = _buffer->size();
322
0
        _buffer->resize(origin_size + len);
323
0
        bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size);
324
24.4k
    } else {
325
        // 3.2 bit pack.
326
        // improve for ascending order input, we could use fewer bit
327
24.4k
        T delta_values[FRAME_VALUE_NUM];
328
24.4k
        if (is_ascending) {
329
220
            delta_values[0] = 0;
330
1.29k
            for (uint8_t i = 1; i < _buffered_values_num; ++i) {
331
1.07k
                delta_values[i] = input[i] - input[i - 1];
332
1.07k
            }
333
24.2k
        } else {
334
24.2k
            bit_width = bits(static_cast<T>(max - min));
335
2.11M
            for (uint8_t i = 0; i < _buffered_values_num; ++i) {
336
2.08M
                delta_values[i] = input[i] - min;
337
2.08M
            }
338
24.2k
        }
339
340
24.4k
        uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8);
341
342
24.4k
        _buffer->reserve(_buffer->size() + packing_len);
343
24.4k
        size_t origin_size = _buffer->size();
344
24.4k
        _buffer->resize(origin_size + packing_len);
345
24.4k
        bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size);
346
24.4k
    }
347
24.4k
    uint8_t storage_format = 0;
348
24.4k
    if (is_keep_original_value) {
349
0
        storage_format = 2;
350
24.4k
    } else if (is_ascending) {
351
220
        storage_format = 1;
352
220
    }
353
24.4k
    _storage_formats.push_back(storage_format);
354
24.4k
    _bit_widths.push_back(bit_width);
355
356
24.4k
    _buffered_values_num = 0;
357
24.4k
}
_ZN5doris10ForEncoderInE27bit_packing_one_frame_valueEPKn
Line
Count
Source
270
24.4k
void ForEncoder<T>::bit_packing_one_frame_value(const T* input) {
271
24.4k
    T min = input[0];
272
24.4k
    T max = input[0];
273
24.4k
    bool is_ascending = true;
274
24.4k
    uint8_t bit_width = 0;
275
24.4k
    T half_max_delta = numeric_limits_max() >> 1;
276
24.4k
    bool is_keep_original_value = false;
277
278
    // 1. make sure order_flag, save_original_value, and find max&min.
279
2.08M
    for (uint8_t i = 1; i < _buffered_values_num; ++i) {
280
2.06M
        if (is_ascending) {
281
41.5k
            if (input[i] < input[i - 1]) {
282
24.2k
                is_ascending = false;
283
24.2k
            } else {
284
17.3k
                if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow
285
0
                    is_keep_original_value = true;
286
17.3k
                } else {
287
17.3k
                    bit_width = std::max(bit_width, bits(input[i] - input[i - 1]));
288
17.3k
                }
289
17.3k
            }
290
41.5k
        }
291
292
2.06M
        if (input[i] < min) {
293
92.7k
            min = input[i];
294
92.7k
            continue;
295
92.7k
        }
296
297
1.97M
        if (input[i] > max) {
298
92.5k
            max = input[i];
299
92.5k
        }
300
1.97M
    }
301
24.4k
    if (!is_ascending) {
302
24.2k
        if ((max >> 1) - (min >> 1) > half_max_delta) {
303
0
            is_keep_original_value = true;
304
0
        }
305
24.2k
    }
306
307
    // 2. save min value.
308
24.4k
    if (sizeof(T) == 16) {
309
24.4k
        put_fixed128_le(_buffer, static_cast<uint128_t>(min));
310
24.4k
    } else if (sizeof(T) == 8) {
311
0
        put_fixed64_le(_buffer, static_cast<uint64_t>(min));
312
0
    } else {
313
0
        put_fixed32_le(_buffer, static_cast<uint32_t>(min));
314
0
    }
315
316
    // 3.1 save original value.
317
24.4k
    if (is_keep_original_value) {
318
0
        bit_width = sizeof(T) * 8;
319
0
        uint32_t len = _buffered_values_num * bit_width;
320
0
        _buffer->reserve(_buffer->size() + len);
321
0
        size_t origin_size = _buffer->size();
322
0
        _buffer->resize(origin_size + len);
323
0
        bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size);
324
24.4k
    } else {
325
        // 3.2 bit pack.
326
        // improve for ascending order input, we could use fewer bit
327
24.4k
        T delta_values[FRAME_VALUE_NUM];
328
24.4k
        if (is_ascending) {
329
214
            delta_values[0] = 0;
330
338
            for (uint8_t i = 1; i < _buffered_values_num; ++i) {
331
124
                delta_values[i] = input[i] - input[i - 1];
332
124
            }
333
24.2k
        } else {
334
24.2k
            bit_width = bits(static_cast<T>(max - min));
335
2.11M
            for (uint8_t i = 0; i < _buffered_values_num; ++i) {
336
2.08M
                delta_values[i] = input[i] - min;
337
2.08M
            }
338
24.2k
        }
339
340
24.4k
        uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8);
341
342
24.4k
        _buffer->reserve(_buffer->size() + packing_len);
343
24.4k
        size_t origin_size = _buffer->size();
344
24.4k
        _buffer->resize(origin_size + packing_len);
345
24.4k
        bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size);
346
24.4k
    }
347
24.4k
    uint8_t storage_format = 0;
348
24.4k
    if (is_keep_original_value) {
349
0
        storage_format = 2;
350
24.4k
    } else if (is_ascending) {
351
214
        storage_format = 1;
352
214
    }
353
24.4k
    _storage_formats.push_back(storage_format);
354
24.4k
    _bit_widths.push_back(bit_width);
355
356
24.4k
    _buffered_values_num = 0;
357
24.4k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIhE27bit_packing_one_frame_valueEPKh
Unexecuted instantiation: _ZN5doris10ForEncoderItE27bit_packing_one_frame_valueEPKt
_ZN5doris10ForEncoderIjE27bit_packing_one_frame_valueEPKj
Line
Count
Source
270
6
void ForEncoder<T>::bit_packing_one_frame_value(const T* input) {
271
6
    T min = input[0];
272
6
    T max = input[0];
273
6
    bool is_ascending = true;
274
6
    uint8_t bit_width = 0;
275
6
    T half_max_delta = numeric_limits_max() >> 1;
276
6
    bool is_keep_original_value = false;
277
278
    // 1. make sure order_flag, save_original_value, and find max&min.
279
768
    for (uint8_t i = 1; i < _buffered_values_num; ++i) {
280
762
        if (is_ascending) {
281
762
            if (input[i] < input[i - 1]) {
282
0
                is_ascending = false;
283
762
            } else {
284
762
                if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow
285
0
                    is_keep_original_value = true;
286
762
                } else {
287
762
                    bit_width = std::max(bit_width, bits(input[i] - input[i - 1]));
288
762
                }
289
762
            }
290
762
        }
291
292
762
        if (input[i] < min) {
293
0
            min = input[i];
294
0
            continue;
295
0
        }
296
297
762
        if (input[i] > max) {
298
762
            max = input[i];
299
762
        }
300
762
    }
301
6
    if (!is_ascending) {
302
0
        if ((max >> 1) - (min >> 1) > half_max_delta) {
303
0
            is_keep_original_value = true;
304
0
        }
305
0
    }
306
307
    // 2. save min value.
308
6
    if (sizeof(T) == 16) {
309
0
        put_fixed128_le(_buffer, static_cast<uint128_t>(min));
310
6
    } else if (sizeof(T) == 8) {
311
0
        put_fixed64_le(_buffer, static_cast<uint64_t>(min));
312
6
    } else {
313
6
        put_fixed32_le(_buffer, static_cast<uint32_t>(min));
314
6
    }
315
316
    // 3.1 save original value.
317
6
    if (is_keep_original_value) {
318
0
        bit_width = sizeof(T) * 8;
319
0
        uint32_t len = _buffered_values_num * bit_width;
320
0
        _buffer->reserve(_buffer->size() + len);
321
0
        size_t origin_size = _buffer->size();
322
0
        _buffer->resize(origin_size + len);
323
0
        bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size);
324
6
    } else {
325
        // 3.2 bit pack.
326
        // improve for ascending order input, we could use fewer bit
327
6
        T delta_values[FRAME_VALUE_NUM];
328
6
        if (is_ascending) {
329
6
            delta_values[0] = 0;
330
768
            for (uint8_t i = 1; i < _buffered_values_num; ++i) {
331
762
                delta_values[i] = input[i] - input[i - 1];
332
762
            }
333
6
        } else {
334
0
            bit_width = bits(static_cast<T>(max - min));
335
0
            for (uint8_t i = 0; i < _buffered_values_num; ++i) {
336
0
                delta_values[i] = input[i] - min;
337
0
            }
338
0
        }
339
340
6
        uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8);
341
342
6
        _buffer->reserve(_buffer->size() + packing_len);
343
6
        size_t origin_size = _buffer->size();
344
6
        _buffer->resize(origin_size + packing_len);
345
6
        bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size);
346
6
    }
347
6
    uint8_t storage_format = 0;
348
6
    if (is_keep_original_value) {
349
0
        storage_format = 2;
350
6
    } else if (is_ascending) {
351
6
        storage_format = 1;
352
6
    }
353
6
    _storage_formats.push_back(storage_format);
354
6
    _bit_widths.push_back(bit_width);
355
356
6
    _buffered_values_num = 0;
357
6
}
Unexecuted instantiation: _ZN5doris10ForEncoderImE27bit_packing_one_frame_valueEPKm
Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE27bit_packing_one_frame_valueEPKS1_
Unexecuted instantiation: _ZN5doris10ForEncoderIoE27bit_packing_one_frame_valueEPKo
358
359
template <typename T>
360
32.6k
uint32_t ForEncoder<T>::flush() {
361
32.6k
    if (_buffered_values_num != 0) {
362
32.5k
        bit_packing_one_frame_value(_buffered_values);
363
32.5k
    }
364
365
    // write the footer:
366
    // 1 _storage_formats and bit_widths
367
32.6k
    DCHECK(_storage_formats.size() == _bit_widths.size())
368
0
            << "Size of _storage_formats and _bit_widths should be equal.";
369
81.5k
    for (size_t i = 0; i < _storage_formats.size(); i++) {
370
48.9k
        _buffer->append(&_storage_formats[i], 1);
371
48.9k
        _buffer->append(&_bit_widths[i], 1);
372
48.9k
    }
373
    // 2 frame_value_num and values_num
374
32.6k
    uint8_t frame_value_num = FRAME_VALUE_NUM;
375
32.6k
    _buffer->append(&frame_value_num, 1);
376
32.6k
    put_fixed32_le(_buffer, _values_num);
377
378
32.6k
    return cast_set<uint32_t>(_buffer->size());
379
32.6k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIaE5flushEv
Unexecuted instantiation: _ZN5doris10ForEncoderIsE5flushEv
_ZN5doris10ForEncoderIiE5flushEv
Line
Count
Source
360
7
uint32_t ForEncoder<T>::flush() {
361
7
    if (_buffered_values_num != 0) {
362
4
        bit_packing_one_frame_value(_buffered_values);
363
4
    }
364
365
    // write the footer:
366
    // 1 _storage_formats and bit_widths
367
7
    DCHECK(_storage_formats.size() == _bit_widths.size())
368
0
            << "Size of _storage_formats and _bit_widths should be equal.";
369
16
    for (size_t i = 0; i < _storage_formats.size(); i++) {
370
9
        _buffer->append(&_storage_formats[i], 1);
371
9
        _buffer->append(&_bit_widths[i], 1);
372
9
    }
373
    // 2 frame_value_num and values_num
374
7
    uint8_t frame_value_num = FRAME_VALUE_NUM;
375
7
    _buffer->append(&frame_value_num, 1);
376
7
    put_fixed32_le(_buffer, _values_num);
377
378
7
    return cast_set<uint32_t>(_buffer->size());
379
7
}
_ZN5doris10ForEncoderIlE5flushEv
Line
Count
Source
360
16.3k
uint32_t ForEncoder<T>::flush() {
361
16.3k
    if (_buffered_values_num != 0) {
362
16.2k
        bit_packing_one_frame_value(_buffered_values);
363
16.2k
    }
364
365
    // write the footer:
366
    // 1 _storage_formats and bit_widths
367
16.3k
    DCHECK(_storage_formats.size() == _bit_widths.size())
368
0
            << "Size of _storage_formats and _bit_widths should be equal.";
369
40.7k
    for (size_t i = 0; i < _storage_formats.size(); i++) {
370
24.4k
        _buffer->append(&_storage_formats[i], 1);
371
24.4k
        _buffer->append(&_bit_widths[i], 1);
372
24.4k
    }
373
    // 2 frame_value_num and values_num
374
16.3k
    uint8_t frame_value_num = FRAME_VALUE_NUM;
375
16.3k
    _buffer->append(&frame_value_num, 1);
376
16.3k
    put_fixed32_le(_buffer, _values_num);
377
378
16.3k
    return cast_set<uint32_t>(_buffer->size());
379
16.3k
}
_ZN5doris10ForEncoderInE5flushEv
Line
Count
Source
360
16.3k
uint32_t ForEncoder<T>::flush() {
361
16.3k
    if (_buffered_values_num != 0) {
362
16.2k
        bit_packing_one_frame_value(_buffered_values);
363
16.2k
    }
364
365
    // write the footer:
366
    // 1 _storage_formats and bit_widths
367
16.3k
    DCHECK(_storage_formats.size() == _bit_widths.size())
368
0
            << "Size of _storage_formats and _bit_widths should be equal.";
369
40.7k
    for (size_t i = 0; i < _storage_formats.size(); i++) {
370
24.4k
        _buffer->append(&_storage_formats[i], 1);
371
24.4k
        _buffer->append(&_bit_widths[i], 1);
372
24.4k
    }
373
    // 2 frame_value_num and values_num
374
16.3k
    uint8_t frame_value_num = FRAME_VALUE_NUM;
375
16.3k
    _buffer->append(&frame_value_num, 1);
376
16.3k
    put_fixed32_le(_buffer, _values_num);
377
378
16.3k
    return cast_set<uint32_t>(_buffer->size());
379
16.3k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIhE5flushEv
Unexecuted instantiation: _ZN5doris10ForEncoderItE5flushEv
_ZN5doris10ForEncoderIjE5flushEv
Line
Count
Source
360
3
uint32_t ForEncoder<T>::flush() {
361
3
    if (_buffered_values_num != 0) {
362
0
        bit_packing_one_frame_value(_buffered_values);
363
0
    }
364
365
    // write the footer:
366
    // 1 _storage_formats and bit_widths
367
3
    DCHECK(_storage_formats.size() == _bit_widths.size())
368
0
            << "Size of _storage_formats and _bit_widths should be equal.";
369
9
    for (size_t i = 0; i < _storage_formats.size(); i++) {
370
6
        _buffer->append(&_storage_formats[i], 1);
371
6
        _buffer->append(&_bit_widths[i], 1);
372
6
    }
373
    // 2 frame_value_num and values_num
374
3
    uint8_t frame_value_num = FRAME_VALUE_NUM;
375
3
    _buffer->append(&frame_value_num, 1);
376
3
    put_fixed32_le(_buffer, _values_num);
377
378
3
    return cast_set<uint32_t>(_buffer->size());
379
3
}
Unexecuted instantiation: _ZN5doris10ForEncoderImE5flushEv
Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE5flushEv
Unexecuted instantiation: _ZN5doris10ForEncoderIoE5flushEv
380
381
template <typename T>
382
48.9k
const T ForEncoder<T>::numeric_limits_max() {
383
48.9k
    return std::numeric_limits<T>::max();
384
48.9k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIaE18numeric_limits_maxEv
Unexecuted instantiation: _ZN5doris10ForEncoderIsE18numeric_limits_maxEv
_ZN5doris10ForEncoderIiE18numeric_limits_maxEv
Line
Count
Source
382
9
const T ForEncoder<T>::numeric_limits_max() {
383
9
    return std::numeric_limits<T>::max();
384
9
}
_ZN5doris10ForEncoderIlE18numeric_limits_maxEv
Line
Count
Source
382
24.4k
const T ForEncoder<T>::numeric_limits_max() {
383
24.4k
    return std::numeric_limits<T>::max();
384
24.4k
}
_ZN5doris10ForEncoderInE18numeric_limits_maxEv
Line
Count
Source
382
24.4k
const T ForEncoder<T>::numeric_limits_max() {
383
24.4k
    return std::numeric_limits<T>::max();
384
24.4k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIhE18numeric_limits_maxEv
Unexecuted instantiation: _ZN5doris10ForEncoderItE18numeric_limits_maxEv
_ZN5doris10ForEncoderIjE18numeric_limits_maxEv
Line
Count
Source
382
6
const T ForEncoder<T>::numeric_limits_max() {
383
6
    return std::numeric_limits<T>::max();
384
6
}
Unexecuted instantiation: _ZN5doris10ForEncoderImE18numeric_limits_maxEv
Unexecuted instantiation: _ZN5doris10ForEncoderIoE18numeric_limits_maxEv
385
386
template <>
387
0
const uint24_t ForEncoder<uint24_t>::numeric_limits_max() {
388
0
    return 0XFFFFFF;
389
0
}
390
391
template <typename T>
392
32.6k
bool ForDecoder<T>::init() {
393
    // When row count is zero, the minimum footer size is 5:
394
    // only has ValuesNum(4) + FrameValueNum(1)
395
32.6k
    if (_buffer_len < 5) {
396
0
        return false;
397
0
    }
398
399
32.6k
    _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5);
400
32.6k
    _values_num = decode_fixed32_le(_buffer + _buffer_len - 4);
401
32.6k
    _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0);
402
32.6k
    _last_frame_size =
403
32.6k
            cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num));
404
405
32.6k
    size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2;
406
407
    // read _storage_formats, bit_widths and compute frame_offsets
408
32.6k
    u_int32_t frame_start_offset = 0;
409
81.5k
    for (uint32_t i = 0; i < _frame_count; i++) {
410
48.9k
        uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset);
411
48.9k
        uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1);
412
48.9k
        _bit_widths.push_back(bit_width);
413
48.9k
        _storage_formats.push_back(order_flag);
414
415
48.9k
        bit_width_offset += 2;
416
417
48.9k
        _frame_offsets.push_back(frame_start_offset);
418
48.9k
        if (sizeof(T) == 16) {
419
24.4k
            frame_start_offset += bit_width * _max_frame_size / 8 + 16;
420
24.4k
        } else if (sizeof(T) == 8) {
421
24.4k
            frame_start_offset += bit_width * _max_frame_size / 8 + 8;
422
24.4k
        } else {
423
15
            frame_start_offset += bit_width * _max_frame_size / 8 + 4;
424
15
        }
425
48.9k
    }
426
427
32.6k
    _out_buffer.resize(_max_frame_size);
428
32.6k
    _parsed = true;
429
430
32.6k
    return true;
431
32.6k
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE4initEv
Unexecuted instantiation: _ZN5doris10ForDecoderIsE4initEv
_ZN5doris10ForDecoderIiE4initEv
Line
Count
Source
392
7
bool ForDecoder<T>::init() {
393
    // When row count is zero, the minimum footer size is 5:
394
    // only has ValuesNum(4) + FrameValueNum(1)
395
7
    if (_buffer_len < 5) {
396
0
        return false;
397
0
    }
398
399
7
    _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5);
400
7
    _values_num = decode_fixed32_le(_buffer + _buffer_len - 4);
401
7
    _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0);
402
7
    _last_frame_size =
403
7
            cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num));
404
405
7
    size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2;
406
407
    // read _storage_formats, bit_widths and compute frame_offsets
408
7
    u_int32_t frame_start_offset = 0;
409
16
    for (uint32_t i = 0; i < _frame_count; i++) {
410
9
        uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset);
411
9
        uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1);
412
9
        _bit_widths.push_back(bit_width);
413
9
        _storage_formats.push_back(order_flag);
414
415
9
        bit_width_offset += 2;
416
417
9
        _frame_offsets.push_back(frame_start_offset);
418
9
        if (sizeof(T) == 16) {
419
0
            frame_start_offset += bit_width * _max_frame_size / 8 + 16;
420
9
        } else if (sizeof(T) == 8) {
421
0
            frame_start_offset += bit_width * _max_frame_size / 8 + 8;
422
9
        } else {
423
9
            frame_start_offset += bit_width * _max_frame_size / 8 + 4;
424
9
        }
425
9
    }
426
427
7
    _out_buffer.resize(_max_frame_size);
428
7
    _parsed = true;
429
430
7
    return true;
431
7
}
_ZN5doris10ForDecoderIlE4initEv
Line
Count
Source
392
16.3k
bool ForDecoder<T>::init() {
393
    // When row count is zero, the minimum footer size is 5:
394
    // only has ValuesNum(4) + FrameValueNum(1)
395
16.3k
    if (_buffer_len < 5) {
396
0
        return false;
397
0
    }
398
399
16.3k
    _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5);
400
16.3k
    _values_num = decode_fixed32_le(_buffer + _buffer_len - 4);
401
16.3k
    _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0);
402
16.3k
    _last_frame_size =
403
16.3k
            cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num));
404
405
16.3k
    size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2;
406
407
    // read _storage_formats, bit_widths and compute frame_offsets
408
16.3k
    u_int32_t frame_start_offset = 0;
409
40.7k
    for (uint32_t i = 0; i < _frame_count; i++) {
410
24.4k
        uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset);
411
24.4k
        uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1);
412
24.4k
        _bit_widths.push_back(bit_width);
413
24.4k
        _storage_formats.push_back(order_flag);
414
415
24.4k
        bit_width_offset += 2;
416
417
24.4k
        _frame_offsets.push_back(frame_start_offset);
418
24.4k
        if (sizeof(T) == 16) {
419
0
            frame_start_offset += bit_width * _max_frame_size / 8 + 16;
420
24.4k
        } else if (sizeof(T) == 8) {
421
24.4k
            frame_start_offset += bit_width * _max_frame_size / 8 + 8;
422
24.4k
        } else {
423
0
            frame_start_offset += bit_width * _max_frame_size / 8 + 4;
424
0
        }
425
24.4k
    }
426
427
16.3k
    _out_buffer.resize(_max_frame_size);
428
16.3k
    _parsed = true;
429
430
16.3k
    return true;
431
16.3k
}
_ZN5doris10ForDecoderInE4initEv
Line
Count
Source
392
16.3k
bool ForDecoder<T>::init() {
393
    // When row count is zero, the minimum footer size is 5:
394
    // only has ValuesNum(4) + FrameValueNum(1)
395
16.3k
    if (_buffer_len < 5) {
396
0
        return false;
397
0
    }
398
399
16.3k
    _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5);
400
16.3k
    _values_num = decode_fixed32_le(_buffer + _buffer_len - 4);
401
16.3k
    _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0);
402
16.3k
    _last_frame_size =
403
16.3k
            cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num));
404
405
16.3k
    size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2;
406
407
    // read _storage_formats, bit_widths and compute frame_offsets
408
16.3k
    u_int32_t frame_start_offset = 0;
409
40.7k
    for (uint32_t i = 0; i < _frame_count; i++) {
410
24.4k
        uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset);
411
24.4k
        uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1);
412
24.4k
        _bit_widths.push_back(bit_width);
413
24.4k
        _storage_formats.push_back(order_flag);
414
415
24.4k
        bit_width_offset += 2;
416
417
24.4k
        _frame_offsets.push_back(frame_start_offset);
418
24.4k
        if (sizeof(T) == 16) {
419
24.4k
            frame_start_offset += bit_width * _max_frame_size / 8 + 16;
420
24.4k
        } else if (sizeof(T) == 8) {
421
0
            frame_start_offset += bit_width * _max_frame_size / 8 + 8;
422
0
        } else {
423
0
            frame_start_offset += bit_width * _max_frame_size / 8 + 4;
424
0
        }
425
24.4k
    }
426
427
16.3k
    _out_buffer.resize(_max_frame_size);
428
16.3k
    _parsed = true;
429
430
16.3k
    return true;
431
16.3k
}
Unexecuted instantiation: _ZN5doris10ForDecoderIhE4initEv
Unexecuted instantiation: _ZN5doris10ForDecoderItE4initEv
_ZN5doris10ForDecoderIjE4initEv
Line
Count
Source
392
3
bool ForDecoder<T>::init() {
393
    // When row count is zero, the minimum footer size is 5:
394
    // only has ValuesNum(4) + FrameValueNum(1)
395
3
    if (_buffer_len < 5) {
396
0
        return false;
397
0
    }
398
399
3
    _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5);
400
3
    _values_num = decode_fixed32_le(_buffer + _buffer_len - 4);
401
3
    _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0);
402
3
    _last_frame_size =
403
3
            cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num));
404
405
3
    size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2;
406
407
    // read _storage_formats, bit_widths and compute frame_offsets
408
3
    u_int32_t frame_start_offset = 0;
409
9
    for (uint32_t i = 0; i < _frame_count; i++) {
410
6
        uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset);
411
6
        uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1);
412
6
        _bit_widths.push_back(bit_width);
413
6
        _storage_formats.push_back(order_flag);
414
415
6
        bit_width_offset += 2;
416
417
6
        _frame_offsets.push_back(frame_start_offset);
418
6
        if (sizeof(T) == 16) {
419
0
            frame_start_offset += bit_width * _max_frame_size / 8 + 16;
420
6
        } else if (sizeof(T) == 8) {
421
0
            frame_start_offset += bit_width * _max_frame_size / 8 + 8;
422
6
        } else {
423
6
            frame_start_offset += bit_width * _max_frame_size / 8 + 4;
424
6
        }
425
6
    }
426
427
3
    _out_buffer.resize(_max_frame_size);
428
3
    _parsed = true;
429
430
3
    return true;
431
3
}
Unexecuted instantiation: _ZN5doris10ForDecoderImE4initEv
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE4initEv
Unexecuted instantiation: _ZN5doris10ForDecoderIoE4initEv
432
433
// todo(kks): improve this method by SIMD instructions
434
435
template <typename T>
436
template <typename U>
437
void ForDecoder<T>::bit_unpack_optimize(const uint8_t* input, uint8_t in_num, int bit_width,
438
81.3k
                                        T* output) {
439
81.3k
    static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value,
440
81.3k
                  "bit_unpack_optimize only supports U = int64_t or __int128_t");
441
81.3k
    constexpr int u_size = sizeof(U);                   // Size of U
442
81.3k
    constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size)
443
81.3k
    int valid_bit = 0;                                  // How many valid bits
444
81.3k
    int need_bit = 0;                                   // still need
445
81.3k
    size_t input_size = (in_num * bit_width + 7) >> 3;  // input's size
446
81.3k
    int full_batch_size =
447
81.3k
            cast_set<int>((input_size >> u_size_shift)
448
81.3k
                          << u_size_shift);     // Adjust input_size to a multiple of u_size
449
81.3k
    int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size.
450
    // The number of bits in input to adjust to multiples of 8 and thus more
451
81.3k
    int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width));
452
453
    // to ensure that only bit_width bits are valid
454
81.3k
    T output_mask;
455
81.3k
    if (bit_width >= static_cast<int>(sizeof(T) * 8)) {
456
0
        output_mask = static_cast<T>(~T(0));
457
81.3k
    } else {
458
81.3k
        output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1);
459
81.3k
    }
460
461
81.3k
    U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking
462
463
4.48M
    for (int i = 0; i < full_batch_size; i += u_size) {
464
4.40M
        s = 0;
465
466
4.40M
        s = to_endian<std::endian::big>(*((U*)(input + i)));
467
468
        // Determine what the valid bits are based on u_size
469
4.40M
        valid_bit = u_size << 3;
470
471
        // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop.
472
4.40M
        if (tail_count == 0 && i == full_batch_size - u_size) {
473
21.7k
            valid_bit -= more_bit;
474
21.7k
            s >>= more_bit;
475
21.7k
        }
476
477
4.40M
        if (need_bit) {
478
            // The last time we take away the high bit_width - need_bit,
479
            // we need to make up the rest of the need_bit from the width.
480
            // Use valid_bit - need_bit to compute high need_bit bits of s
481
            // perform an AND operation to ensure that only need_bit bits are valid
482
4.09M
            auto mask = (static_cast<U>(1) << need_bit) - 1;
483
4.09M
            auto shifted = s >> (valid_bit - need_bit);
484
4.09M
            auto masked_result = shifted & mask;
485
4.09M
            if constexpr (sizeof(T) <= 4) {
486
0
                *output |= static_cast<T>(static_cast<uint32_t>(masked_result));
487
4.09M
            } else {
488
4.09M
                *output |= static_cast<T>(masked_result);
489
4.09M
            }
490
4.09M
            output++;
491
4.09M
            valid_bit -= need_bit;
492
4.09M
        }
493
494
4.40M
        int num = valid_bit / bit_width;             // How many outputs can be processed at a time
495
4.40M
        int remainder = valid_bit - num * bit_width; // How many bits are left to store
496
497
        // Starting with the highest valid bit, take out bit_width bits in sequence
498
        // perform an AND operation with output_mask to ensure that only bit_width bits are valid
499
        // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end
500
        // But since there are still remainder bits that can't be processed, need to add the remainder
501
8.51M
        for (int j = 0; j < num; j++) {
502
4.11M
            *output =
503
4.11M
                    static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask);
504
4.11M
            output++;
505
4.11M
        }
506
507
4.40M
        if (remainder) {
508
            // Process the last remaining remainder bit.
509
            // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits.
510
            // output = y << (bit_width - remainder) Use the high bit_width - remainder bit
511
4.14M
            if constexpr (sizeof(T) <= 4) {
512
0
                auto masked_value = static_cast<T>(
513
0
                        static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1)));
514
0
                *output = static_cast<T>(masked_value << (bit_width - remainder));
515
4.14M
            } else {
516
4.14M
                auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1)));
517
4.14M
                *output = static_cast<T>(masked_value << (bit_width - remainder));
518
4.14M
            }
519
            // Already have remainder bits, next time need bit_width - remainder bits
520
4.14M
            need_bit = bit_width - remainder;
521
4.14M
        } else {
522
257k
            need_bit = 0;
523
257k
        }
524
4.40M
    }
525
526
    // remainder
527
81.3k
    if (tail_count) {
528
        // Put the tail_count numbers in the input into s in order, each number occupies 8 bit
529
477k
        for (int i = 0; i < tail_count; i++) {
530
417k
            s <<= 8;
531
417k
            s |= input[full_batch_size + i];
532
417k
        }
533
534
        // tail * 8 is the number of bits that are left to process
535
        // tail * 8 - more_bit is to remove the last more_bit
536
59.2k
        valid_bit = (tail_count << 3) - more_bit;
537
59.2k
        s >>= more_bit;
538
539
        // same as before
540
59.2k
        if (need_bit) {
541
54.0k
            if constexpr (sizeof(T) <= 4) {
542
0
                *output |= static_cast<T>(static_cast<uint32_t>(
543
0
                        (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1)));
544
54.0k
            } else {
545
54.0k
                *output |= static_cast<T>((s >> (valid_bit - need_bit)) &
546
54.0k
                                          ((static_cast<U>(1) << need_bit) - 1));
547
54.0k
            }
548
54.0k
            output++;
549
54.0k
            valid_bit -= need_bit;
550
54.0k
        }
551
552
59.2k
        int num = valid_bit / bit_width; // How many outputs can be processed at a time
553
554
        // same as before
555
126k
        for (int j = 0; j < num; j++) {
556
67.2k
            *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask);
557
67.2k
            output++;
558
67.2k
        }
559
59.2k
    }
560
81.3k
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE19bit_unpack_optimizeIlEEvPKhhiPa
Unexecuted instantiation: _ZN5doris10ForDecoderIaE19bit_unpack_optimizeInEEvPKhhiPa
Unexecuted instantiation: _ZN5doris10ForDecoderIsE19bit_unpack_optimizeIlEEvPKhhiPs
Unexecuted instantiation: _ZN5doris10ForDecoderIsE19bit_unpack_optimizeInEEvPKhhiPs
_ZN5doris10ForDecoderIiE19bit_unpack_optimizeIlEEvPKhhiPi
Line
Count
Source
438
9
                                        T* output) {
439
9
    static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value,
440
9
                  "bit_unpack_optimize only supports U = int64_t or __int128_t");
441
9
    constexpr int u_size = sizeof(U);                   // Size of U
442
9
    constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size)
443
9
    int valid_bit = 0;                                  // How many valid bits
444
9
    int need_bit = 0;                                   // still need
445
9
    size_t input_size = (in_num * bit_width + 7) >> 3;  // input's size
446
9
    int full_batch_size =
447
9
            cast_set<int>((input_size >> u_size_shift)
448
9
                          << u_size_shift);     // Adjust input_size to a multiple of u_size
449
9
    int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size.
450
    // The number of bits in input to adjust to multiples of 8 and thus more
451
9
    int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width));
452
453
    // to ensure that only bit_width bits are valid
454
9
    T output_mask;
455
9
    if (bit_width >= static_cast<int>(sizeof(T) * 8)) {
456
0
        output_mask = static_cast<T>(~T(0));
457
9
    } else {
458
9
        output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1);
459
9
    }
460
461
9
    U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking
462
463
21
    for (int i = 0; i < full_batch_size; i += u_size) {
464
12
        s = 0;
465
466
12
        s = to_endian<std::endian::big>(*((U*)(input + i)));
467
468
        // Determine what the valid bits are based on u_size
469
12
        valid_bit = u_size << 3;
470
471
        // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop.
472
12
        if (tail_count == 0 && i == full_batch_size - u_size) {
473
7
            valid_bit -= more_bit;
474
7
            s >>= more_bit;
475
7
        }
476
477
12
        if (need_bit) {
478
            // The last time we take away the high bit_width - need_bit,
479
            // we need to make up the rest of the need_bit from the width.
480
            // Use valid_bit - need_bit to compute high need_bit bits of s
481
            // perform an AND operation to ensure that only need_bit bits are valid
482
0
            auto mask = (static_cast<U>(1) << need_bit) - 1;
483
0
            auto shifted = s >> (valid_bit - need_bit);
484
0
            auto masked_result = shifted & mask;
485
0
            if constexpr (sizeof(T) <= 4) {
486
0
                *output |= static_cast<T>(static_cast<uint32_t>(masked_result));
487
            } else {
488
                *output |= static_cast<T>(masked_result);
489
            }
490
0
            output++;
491
0
            valid_bit -= need_bit;
492
0
        }
493
494
12
        int num = valid_bit / bit_width;             // How many outputs can be processed at a time
495
12
        int remainder = valid_bit - num * bit_width; // How many bits are left to store
496
497
        // Starting with the highest valid bit, take out bit_width bits in sequence
498
        // perform an AND operation with output_mask to ensure that only bit_width bits are valid
499
        // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end
500
        // But since there are still remainder bits that can't be processed, need to add the remainder
501
780
        for (int j = 0; j < num; j++) {
502
768
            *output =
503
768
                    static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask);
504
768
            output++;
505
768
        }
506
507
12
        if (remainder) {
508
            // Process the last remaining remainder bit.
509
            // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits.
510
            // output = y << (bit_width - remainder) Use the high bit_width - remainder bit
511
0
            if constexpr (sizeof(T) <= 4) {
512
0
                auto masked_value = static_cast<T>(
513
0
                        static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1)));
514
0
                *output = static_cast<T>(masked_value << (bit_width - remainder));
515
            } else {
516
                auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1)));
517
                *output = static_cast<T>(masked_value << (bit_width - remainder));
518
            }
519
            // Already have remainder bits, next time need bit_width - remainder bits
520
0
            need_bit = bit_width - remainder;
521
12
        } else {
522
12
            need_bit = 0;
523
12
        }
524
12
    }
525
526
    // remainder
527
9
    if (tail_count) {
528
        // Put the tail_count numbers in the input into s in order, each number occupies 8 bit
529
2
        for (int i = 0; i < tail_count; i++) {
530
1
            s <<= 8;
531
1
            s |= input[full_batch_size + i];
532
1
        }
533
534
        // tail * 8 is the number of bits that are left to process
535
        // tail * 8 - more_bit is to remove the last more_bit
536
1
        valid_bit = (tail_count << 3) - more_bit;
537
1
        s >>= more_bit;
538
539
        // same as before
540
1
        if (need_bit) {
541
0
            if constexpr (sizeof(T) <= 4) {
542
0
                *output |= static_cast<T>(static_cast<uint32_t>(
543
0
                        (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1)));
544
            } else {
545
                *output |= static_cast<T>((s >> (valid_bit - need_bit)) &
546
                                          ((static_cast<U>(1) << need_bit) - 1));
547
            }
548
0
            output++;
549
0
            valid_bit -= need_bit;
550
0
        }
551
552
1
        int num = valid_bit / bit_width; // How many outputs can be processed at a time
553
554
        // same as before
555
3
        for (int j = 0; j < num; j++) {
556
2
            *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask);
557
2
            output++;
558
2
        }
559
1
    }
560
9
}
Unexecuted instantiation: _ZN5doris10ForDecoderIiE19bit_unpack_optimizeInEEvPKhhiPi
_ZN5doris10ForDecoderIlE19bit_unpack_optimizeIlEEvPKhhiPl
Line
Count
Source
438
12.3k
                                        T* output) {
439
12.3k
    static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value,
440
12.3k
                  "bit_unpack_optimize only supports U = int64_t or __int128_t");
441
12.3k
    constexpr int u_size = sizeof(U);                   // Size of U
442
12.3k
    constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size)
443
12.3k
    int valid_bit = 0;                                  // How many valid bits
444
12.3k
    int need_bit = 0;                                   // still need
445
12.3k
    size_t input_size = (in_num * bit_width + 7) >> 3;  // input's size
446
12.3k
    int full_batch_size =
447
12.3k
            cast_set<int>((input_size >> u_size_shift)
448
12.3k
                          << u_size_shift);     // Adjust input_size to a multiple of u_size
449
12.3k
    int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size.
450
    // The number of bits in input to adjust to multiples of 8 and thus more
451
12.3k
    int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width));
452
453
    // to ensure that only bit_width bits are valid
454
12.3k
    T output_mask;
455
12.3k
    if (bit_width >= static_cast<int>(sizeof(T) * 8)) {
456
0
        output_mask = static_cast<T>(~T(0));
457
12.3k
    } else {
458
12.3k
        output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1);
459
12.3k
    }
460
461
12.3k
    U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking
462
463
278k
    for (int i = 0; i < full_batch_size; i += u_size) {
464
266k
        s = 0;
465
466
266k
        s = to_endian<std::endian::big>(*((U*)(input + i)));
467
468
        // Determine what the valid bits are based on u_size
469
266k
        valid_bit = u_size << 3;
470
471
        // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop.
472
266k
        if (tail_count == 0 && i == full_batch_size - u_size) {
473
5.19k
            valid_bit -= more_bit;
474
5.19k
            s >>= more_bit;
475
5.19k
        }
476
477
266k
        if (need_bit) {
478
            // The last time we take away the high bit_width - need_bit,
479
            // we need to make up the rest of the need_bit from the width.
480
            // Use valid_bit - need_bit to compute high need_bit bits of s
481
            // perform an AND operation to ensure that only need_bit bits are valid
482
207k
            auto mask = (static_cast<U>(1) << need_bit) - 1;
483
207k
            auto shifted = s >> (valid_bit - need_bit);
484
207k
            auto masked_result = shifted & mask;
485
            if constexpr (sizeof(T) <= 4) {
486
                *output |= static_cast<T>(static_cast<uint32_t>(masked_result));
487
207k
            } else {
488
207k
                *output |= static_cast<T>(masked_result);
489
207k
            }
490
207k
            output++;
491
207k
            valid_bit -= need_bit;
492
207k
        }
493
494
266k
        int num = valid_bit / bit_width;             // How many outputs can be processed at a time
495
266k
        int remainder = valid_bit - num * bit_width; // How many bits are left to store
496
497
        // Starting with the highest valid bit, take out bit_width bits in sequence
498
        // perform an AND operation with output_mask to ensure that only bit_width bits are valid
499
        // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end
500
        // But since there are still remainder bits that can't be processed, need to add the remainder
501
1.07M
        for (int j = 0; j < num; j++) {
502
809k
            *output =
503
809k
                    static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask);
504
809k
            output++;
505
809k
        }
506
507
266k
        if (remainder) {
508
            // Process the last remaining remainder bit.
509
            // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits.
510
            // output = y << (bit_width - remainder) Use the high bit_width - remainder bit
511
            if constexpr (sizeof(T) <= 4) {
512
                auto masked_value = static_cast<T>(
513
                        static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1)));
514
                *output = static_cast<T>(masked_value << (bit_width - remainder));
515
212k
            } else {
516
212k
                auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1)));
517
212k
                *output = static_cast<T>(masked_value << (bit_width - remainder));
518
212k
            }
519
            // Already have remainder bits, next time need bit_width - remainder bits
520
212k
            need_bit = bit_width - remainder;
521
212k
        } else {
522
53.9k
            need_bit = 0;
523
53.9k
        }
524
266k
    }
525
526
    // remainder
527
12.3k
    if (tail_count) {
528
        // Put the tail_count numbers in the input into s in order, each number occupies 8 bit
529
35.0k
        for (int i = 0; i < tail_count; i++) {
530
28.0k
            s <<= 8;
531
28.0k
            s |= input[full_batch_size + i];
532
28.0k
        }
533
534
        // tail * 8 is the number of bits that are left to process
535
        // tail * 8 - more_bit is to remove the last more_bit
536
6.98k
        valid_bit = (tail_count << 3) - more_bit;
537
6.98k
        s >>= more_bit;
538
539
        // same as before
540
6.98k
        if (need_bit) {
541
            if constexpr (sizeof(T) <= 4) {
542
                *output |= static_cast<T>(static_cast<uint32_t>(
543
                        (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1)));
544
5.14k
            } else {
545
5.14k
                *output |= static_cast<T>((s >> (valid_bit - need_bit)) &
546
5.14k
                                          ((static_cast<U>(1) << need_bit) - 1));
547
5.14k
            }
548
5.14k
            output++;
549
5.14k
            valid_bit -= need_bit;
550
5.14k
        }
551
552
6.98k
        int num = valid_bit / bit_width; // How many outputs can be processed at a time
553
554
        // same as before
555
30.6k
        for (int j = 0; j < num; j++) {
556
23.6k
            *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask);
557
23.6k
            output++;
558
23.6k
        }
559
6.98k
    }
560
12.3k
}
_ZN5doris10ForDecoderIlE19bit_unpack_optimizeInEEvPKhhiPl
Line
Count
Source
438
12.1k
                                        T* output) {
439
12.1k
    static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value,
440
12.1k
                  "bit_unpack_optimize only supports U = int64_t or __int128_t");
441
12.1k
    constexpr int u_size = sizeof(U);                   // Size of U
442
12.1k
    constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size)
443
12.1k
    int valid_bit = 0;                                  // How many valid bits
444
12.1k
    int need_bit = 0;                                   // still need
445
12.1k
    size_t input_size = (in_num * bit_width + 7) >> 3;  // input's size
446
12.1k
    int full_batch_size =
447
12.1k
            cast_set<int>((input_size >> u_size_shift)
448
12.1k
                          << u_size_shift);     // Adjust input_size to a multiple of u_size
449
12.1k
    int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size.
450
    // The number of bits in input to adjust to multiples of 8 and thus more
451
12.1k
    int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width));
452
453
    // to ensure that only bit_width bits are valid
454
12.1k
    T output_mask;
455
12.1k
    if (bit_width >= static_cast<int>(sizeof(T) * 8)) {
456
0
        output_mask = static_cast<T>(~T(0));
457
12.1k
    } else {
458
12.1k
        output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1);
459
12.1k
    }
460
461
12.1k
    U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking
462
463
403k
    for (int i = 0; i < full_batch_size; i += u_size) {
464
391k
        s = 0;
465
466
391k
        s = to_endian<std::endian::big>(*((U*)(input + i)));
467
468
        // Determine what the valid bits are based on u_size
469
391k
        valid_bit = u_size << 3;
470
471
        // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop.
472
391k
        if (tail_count == 0 && i == full_batch_size - u_size) {
473
4.55k
            valid_bit -= more_bit;
474
4.55k
            s >>= more_bit;
475
4.55k
        }
476
477
391k
        if (need_bit) {
478
            // The last time we take away the high bit_width - need_bit,
479
            // we need to make up the rest of the need_bit from the width.
480
            // Use valid_bit - need_bit to compute high need_bit bits of s
481
            // perform an AND operation to ensure that only need_bit bits are valid
482
367k
            auto mask = (static_cast<U>(1) << need_bit) - 1;
483
367k
            auto shifted = s >> (valid_bit - need_bit);
484
367k
            auto masked_result = shifted & mask;
485
            if constexpr (sizeof(T) <= 4) {
486
                *output |= static_cast<T>(static_cast<uint32_t>(masked_result));
487
367k
            } else {
488
367k
                *output |= static_cast<T>(masked_result);
489
367k
            }
490
367k
            output++;
491
367k
            valid_bit -= need_bit;
492
367k
        }
493
494
391k
        int num = valid_bit / bit_width;             // How many outputs can be processed at a time
495
391k
        int remainder = valid_bit - num * bit_width; // How many bits are left to store
496
497
        // Starting with the highest valid bit, take out bit_width bits in sequence
498
        // perform an AND operation with output_mask to ensure that only bit_width bits are valid
499
        // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end
500
        // But since there are still remainder bits that can't be processed, need to add the remainder
501
1.05M
        for (int j = 0; j < num; j++) {
502
663k
            *output =
503
663k
                    static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask);
504
663k
            output++;
505
663k
        }
506
507
391k
        if (remainder) {
508
            // Process the last remaining remainder bit.
509
            // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits.
510
            // output = y << (bit_width - remainder) Use the high bit_width - remainder bit
511
            if constexpr (sizeof(T) <= 4) {
512
                auto masked_value = static_cast<T>(
513
                        static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1)));
514
                *output = static_cast<T>(masked_value << (bit_width - remainder));
515
374k
            } else {
516
374k
                auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1)));
517
374k
                *output = static_cast<T>(masked_value << (bit_width - remainder));
518
374k
            }
519
            // Already have remainder bits, next time need bit_width - remainder bits
520
374k
            need_bit = bit_width - remainder;
521
374k
        } else {
522
16.9k
            need_bit = 0;
523
16.9k
        }
524
391k
    }
525
526
    // remainder
527
12.1k
    if (tail_count) {
528
        // Put the tail_count numbers in the input into s in order, each number occupies 8 bit
529
68.5k
        for (int i = 0; i < tail_count; i++) {
530
60.9k
            s <<= 8;
531
60.9k
            s |= input[full_batch_size + i];
532
60.9k
        }
533
534
        // tail * 8 is the number of bits that are left to process
535
        // tail * 8 - more_bit is to remove the last more_bit
536
7.60k
        valid_bit = (tail_count << 3) - more_bit;
537
7.60k
        s >>= more_bit;
538
539
        // same as before
540
7.60k
        if (need_bit) {
541
            if constexpr (sizeof(T) <= 4) {
542
                *output |= static_cast<T>(static_cast<uint32_t>(
543
                        (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1)));
544
7.31k
            } else {
545
7.31k
                *output |= static_cast<T>((s >> (valid_bit - need_bit)) &
546
7.31k
                                          ((static_cast<U>(1) << need_bit) - 1));
547
7.31k
            }
548
7.31k
            output++;
549
7.31k
            valid_bit -= need_bit;
550
7.31k
        }
551
552
7.60k
        int num = valid_bit / bit_width; // How many outputs can be processed at a time
553
554
        // same as before
555
14.1k
        for (int j = 0; j < num; j++) {
556
6.51k
            *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask);
557
6.51k
            output++;
558
6.51k
        }
559
7.60k
    }
560
12.1k
}
_ZN5doris10ForDecoderInE19bit_unpack_optimizeIlEEvPKhhiPn
Line
Count
Source
438
8.28k
                                        T* output) {
439
8.28k
    static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value,
440
8.28k
                  "bit_unpack_optimize only supports U = int64_t or __int128_t");
441
8.28k
    constexpr int u_size = sizeof(U);                   // Size of U
442
8.28k
    constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size)
443
8.28k
    int valid_bit = 0;                                  // How many valid bits
444
8.28k
    int need_bit = 0;                                   // still need
445
8.28k
    size_t input_size = (in_num * bit_width + 7) >> 3;  // input's size
446
8.28k
    int full_batch_size =
447
8.28k
            cast_set<int>((input_size >> u_size_shift)
448
8.28k
                          << u_size_shift);     // Adjust input_size to a multiple of u_size
449
8.28k
    int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size.
450
    // The number of bits in input to adjust to multiples of 8 and thus more
451
8.28k
    int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width));
452
453
    // to ensure that only bit_width bits are valid
454
8.28k
    T output_mask;
455
8.28k
    if (bit_width >= static_cast<int>(sizeof(T) * 8)) {
456
0
        output_mask = static_cast<T>(~T(0));
457
8.28k
    } else {
458
8.28k
        output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1);
459
8.28k
    }
460
461
8.28k
    U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking
462
463
274k
    for (int i = 0; i < full_batch_size; i += u_size) {
464
266k
        s = 0;
465
466
266k
        s = to_endian<std::endian::big>(*((U*)(input + i)));
467
468
        // Determine what the valid bits are based on u_size
469
266k
        valid_bit = u_size << 3;
470
471
        // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop.
472
266k
        if (tail_count == 0 && i == full_batch_size - u_size) {
473
1.12k
            valid_bit -= more_bit;
474
1.12k
            s >>= more_bit;
475
1.12k
        }
476
477
266k
        if (need_bit) {
478
            // The last time we take away the high bit_width - need_bit,
479
            // we need to make up the rest of the need_bit from the width.
480
            // Use valid_bit - need_bit to compute high need_bit bits of s
481
            // perform an AND operation to ensure that only need_bit bits are valid
482
207k
            auto mask = (static_cast<U>(1) << need_bit) - 1;
483
207k
            auto shifted = s >> (valid_bit - need_bit);
484
207k
            auto masked_result = shifted & mask;
485
            if constexpr (sizeof(T) <= 4) {
486
                *output |= static_cast<T>(static_cast<uint32_t>(masked_result));
487
207k
            } else {
488
207k
                *output |= static_cast<T>(masked_result);
489
207k
            }
490
207k
            output++;
491
207k
            valid_bit -= need_bit;
492
207k
        }
493
494
266k
        int num = valid_bit / bit_width;             // How many outputs can be processed at a time
495
266k
        int remainder = valid_bit - num * bit_width; // How many bits are left to store
496
497
        // Starting with the highest valid bit, take out bit_width bits in sequence
498
        // perform an AND operation with output_mask to ensure that only bit_width bits are valid
499
        // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end
500
        // But since there are still remainder bits that can't be processed, need to add the remainder
501
1.07M
        for (int j = 0; j < num; j++) {
502
808k
            *output =
503
808k
                    static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask);
504
808k
            output++;
505
808k
        }
506
507
266k
        if (remainder) {
508
            // Process the last remaining remainder bit.
509
            // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits.
510
            // output = y << (bit_width - remainder) Use the high bit_width - remainder bit
511
            if constexpr (sizeof(T) <= 4) {
512
                auto masked_value = static_cast<T>(
513
                        static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1)));
514
                *output = static_cast<T>(masked_value << (bit_width - remainder));
515
212k
            } else {
516
212k
                auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1)));
517
212k
                *output = static_cast<T>(masked_value << (bit_width - remainder));
518
212k
            }
519
            // Already have remainder bits, next time need bit_width - remainder bits
520
212k
            need_bit = bit_width - remainder;
521
212k
        } else {
522
53.9k
            need_bit = 0;
523
53.9k
        }
524
266k
    }
525
526
    // remainder
527
8.28k
    if (tail_count) {
528
        // Put the tail_count numbers in the input into s in order, each number occupies 8 bit
529
35.2k
        for (int i = 0; i < tail_count; i++) {
530
28.1k
            s <<= 8;
531
28.1k
            s |= input[full_batch_size + i];
532
28.1k
        }
533
534
        // tail * 8 is the number of bits that are left to process
535
        // tail * 8 - more_bit is to remove the last more_bit
536
7.04k
        valid_bit = (tail_count << 3) - more_bit;
537
7.04k
        s >>= more_bit;
538
539
        // same as before
540
7.04k
        if (need_bit) {
541
            if constexpr (sizeof(T) <= 4) {
542
                *output |= static_cast<T>(static_cast<uint32_t>(
543
                        (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1)));
544
5.14k
            } else {
545
5.14k
                *output |= static_cast<T>((s >> (valid_bit - need_bit)) &
546
5.14k
                                          ((static_cast<U>(1) << need_bit) - 1));
547
5.14k
            }
548
5.14k
            output++;
549
5.14k
            valid_bit -= need_bit;
550
5.14k
        }
551
552
7.04k
        int num = valid_bit / bit_width; // How many outputs can be processed at a time
553
554
        // same as before
555
30.7k
        for (int j = 0; j < num; j++) {
556
23.7k
            *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask);
557
23.7k
            output++;
558
23.7k
        }
559
7.04k
    }
560
8.28k
}
_ZN5doris10ForDecoderInE19bit_unpack_optimizeInEEvPKhhiPn
Line
Count
Source
438
48.5k
                                        T* output) {
439
48.5k
    static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value,
440
48.5k
                  "bit_unpack_optimize only supports U = int64_t or __int128_t");
441
48.5k
    constexpr int u_size = sizeof(U);                   // Size of U
442
48.5k
    constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size)
443
48.5k
    int valid_bit = 0;                                  // How many valid bits
444
48.5k
    int need_bit = 0;                                   // still need
445
48.5k
    size_t input_size = (in_num * bit_width + 7) >> 3;  // input's size
446
48.5k
    int full_batch_size =
447
48.5k
            cast_set<int>((input_size >> u_size_shift)
448
48.5k
                          << u_size_shift);     // Adjust input_size to a multiple of u_size
449
48.5k
    int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size.
450
    // The number of bits in input to adjust to multiples of 8 and thus more
451
48.5k
    int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width));
452
453
    // to ensure that only bit_width bits are valid
454
48.5k
    T output_mask;
455
48.5k
    if (bit_width >= static_cast<int>(sizeof(T) * 8)) {
456
0
        output_mask = static_cast<T>(~T(0));
457
48.5k
    } else {
458
48.5k
        output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1);
459
48.5k
    }
460
461
48.5k
    U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking
462
463
3.52M
    for (int i = 0; i < full_batch_size; i += u_size) {
464
3.47M
        s = 0;
465
466
3.47M
        s = to_endian<std::endian::big>(*((U*)(input + i)));
467
468
        // Determine what the valid bits are based on u_size
469
3.47M
        valid_bit = u_size << 3;
470
471
        // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop.
472
3.47M
        if (tail_count == 0 && i == full_batch_size - u_size) {
473
10.8k
            valid_bit -= more_bit;
474
10.8k
            s >>= more_bit;
475
10.8k
        }
476
477
3.47M
        if (need_bit) {
478
            // The last time we take away the high bit_width - need_bit,
479
            // we need to make up the rest of the need_bit from the width.
480
            // Use valid_bit - need_bit to compute high need_bit bits of s
481
            // perform an AND operation to ensure that only need_bit bits are valid
482
3.30M
            auto mask = (static_cast<U>(1) << need_bit) - 1;
483
3.30M
            auto shifted = s >> (valid_bit - need_bit);
484
3.30M
            auto masked_result = shifted & mask;
485
            if constexpr (sizeof(T) <= 4) {
486
                *output |= static_cast<T>(static_cast<uint32_t>(masked_result));
487
3.30M
            } else {
488
3.30M
                *output |= static_cast<T>(masked_result);
489
3.30M
            }
490
3.30M
            output++;
491
3.30M
            valid_bit -= need_bit;
492
3.30M
        }
493
494
3.47M
        int num = valid_bit / bit_width;             // How many outputs can be processed at a time
495
3.47M
        int remainder = valid_bit - num * bit_width; // How many bits are left to store
496
497
        // Starting with the highest valid bit, take out bit_width bits in sequence
498
        // perform an AND operation with output_mask to ensure that only bit_width bits are valid
499
        // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end
500
        // But since there are still remainder bits that can't be processed, need to add the remainder
501
5.30M
        for (int j = 0; j < num; j++) {
502
1.83M
            *output =
503
1.83M
                    static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask);
504
1.83M
            output++;
505
1.83M
        }
506
507
3.47M
        if (remainder) {
508
            // Process the last remaining remainder bit.
509
            // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits.
510
            // output = y << (bit_width - remainder) Use the high bit_width - remainder bit
511
            if constexpr (sizeof(T) <= 4) {
512
                auto masked_value = static_cast<T>(
513
                        static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1)));
514
                *output = static_cast<T>(masked_value << (bit_width - remainder));
515
3.34M
            } else {
516
3.34M
                auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1)));
517
3.34M
                *output = static_cast<T>(masked_value << (bit_width - remainder));
518
3.34M
            }
519
            // Already have remainder bits, next time need bit_width - remainder bits
520
3.34M
            need_bit = bit_width - remainder;
521
3.34M
        } else {
522
132k
            need_bit = 0;
523
132k
        }
524
3.47M
    }
525
526
    // remainder
527
48.5k
    if (tail_count) {
528
        // Put the tail_count numbers in the input into s in order, each number occupies 8 bit
529
338k
        for (int i = 0; i < tail_count; i++) {
530
300k
            s <<= 8;
531
300k
            s |= input[full_batch_size + i];
532
300k
        }
533
534
        // tail * 8 is the number of bits that are left to process
535
        // tail * 8 - more_bit is to remove the last more_bit
536
37.6k
        valid_bit = (tail_count << 3) - more_bit;
537
37.6k
        s >>= more_bit;
538
539
        // same as before
540
37.6k
        if (need_bit) {
541
            if constexpr (sizeof(T) <= 4) {
542
                *output |= static_cast<T>(static_cast<uint32_t>(
543
                        (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1)));
544
36.4k
            } else {
545
36.4k
                *output |= static_cast<T>((s >> (valid_bit - need_bit)) &
546
36.4k
                                          ((static_cast<U>(1) << need_bit) - 1));
547
36.4k
            }
548
36.4k
            output++;
549
36.4k
            valid_bit -= need_bit;
550
36.4k
        }
551
552
37.6k
        int num = valid_bit / bit_width; // How many outputs can be processed at a time
553
554
        // same as before
555
50.9k
        for (int j = 0; j < num; j++) {
556
13.3k
            *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask);
557
13.3k
            output++;
558
13.3k
        }
559
37.6k
    }
560
48.5k
}
Unexecuted instantiation: _ZN5doris10ForDecoderIhE19bit_unpack_optimizeIlEEvPKhhiPh
Unexecuted instantiation: _ZN5doris10ForDecoderIhE19bit_unpack_optimizeInEEvPKhhiPh
Unexecuted instantiation: _ZN5doris10ForDecoderItE19bit_unpack_optimizeIlEEvPKhhiPt
Unexecuted instantiation: _ZN5doris10ForDecoderItE19bit_unpack_optimizeInEEvPKhhiPt
_ZN5doris10ForDecoderIjE19bit_unpack_optimizeIlEEvPKhhiPj
Line
Count
Source
438
5
                                        T* output) {
439
5
    static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value,
440
5
                  "bit_unpack_optimize only supports U = int64_t or __int128_t");
441
5
    constexpr int u_size = sizeof(U);                   // Size of U
442
5
    constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size)
443
5
    int valid_bit = 0;                                  // How many valid bits
444
5
    int need_bit = 0;                                   // still need
445
5
    size_t input_size = (in_num * bit_width + 7) >> 3;  // input's size
446
5
    int full_batch_size =
447
5
            cast_set<int>((input_size >> u_size_shift)
448
5
                          << u_size_shift);     // Adjust input_size to a multiple of u_size
449
5
    int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size.
450
    // The number of bits in input to adjust to multiples of 8 and thus more
451
5
    int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width));
452
453
    // to ensure that only bit_width bits are valid
454
5
    T output_mask;
455
5
    if (bit_width >= static_cast<int>(sizeof(T) * 8)) {
456
0
        output_mask = static_cast<T>(~T(0));
457
5
    } else {
458
5
        output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1);
459
5
    }
460
461
5
    U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking
462
463
15
    for (int i = 0; i < full_batch_size; i += u_size) {
464
10
        s = 0;
465
466
10
        s = to_endian<std::endian::big>(*((U*)(input + i)));
467
468
        // Determine what the valid bits are based on u_size
469
10
        valid_bit = u_size << 3;
470
471
        // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop.
472
10
        if (tail_count == 0 && i == full_batch_size - u_size) {
473
5
            valid_bit -= more_bit;
474
5
            s >>= more_bit;
475
5
        }
476
477
10
        if (need_bit) {
478
            // The last time we take away the high bit_width - need_bit,
479
            // we need to make up the rest of the need_bit from the width.
480
            // Use valid_bit - need_bit to compute high need_bit bits of s
481
            // perform an AND operation to ensure that only need_bit bits are valid
482
0
            auto mask = (static_cast<U>(1) << need_bit) - 1;
483
0
            auto shifted = s >> (valid_bit - need_bit);
484
0
            auto masked_result = shifted & mask;
485
0
            if constexpr (sizeof(T) <= 4) {
486
0
                *output |= static_cast<T>(static_cast<uint32_t>(masked_result));
487
            } else {
488
                *output |= static_cast<T>(masked_result);
489
            }
490
0
            output++;
491
0
            valid_bit -= need_bit;
492
0
        }
493
494
10
        int num = valid_bit / bit_width;             // How many outputs can be processed at a time
495
10
        int remainder = valid_bit - num * bit_width; // How many bits are left to store
496
497
        // Starting with the highest valid bit, take out bit_width bits in sequence
498
        // perform an AND operation with output_mask to ensure that only bit_width bits are valid
499
        // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end
500
        // But since there are still remainder bits that can't be processed, need to add the remainder
501
650
        for (int j = 0; j < num; j++) {
502
640
            *output =
503
640
                    static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask);
504
640
            output++;
505
640
        }
506
507
10
        if (remainder) {
508
            // Process the last remaining remainder bit.
509
            // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits.
510
            // output = y << (bit_width - remainder) Use the high bit_width - remainder bit
511
0
            if constexpr (sizeof(T) <= 4) {
512
0
                auto masked_value = static_cast<T>(
513
0
                        static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1)));
514
0
                *output = static_cast<T>(masked_value << (bit_width - remainder));
515
            } else {
516
                auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1)));
517
                *output = static_cast<T>(masked_value << (bit_width - remainder));
518
            }
519
            // Already have remainder bits, next time need bit_width - remainder bits
520
0
            need_bit = bit_width - remainder;
521
10
        } else {
522
10
            need_bit = 0;
523
10
        }
524
10
    }
525
526
    // remainder
527
5
    if (tail_count) {
528
        // Put the tail_count numbers in the input into s in order, each number occupies 8 bit
529
0
        for (int i = 0; i < tail_count; i++) {
530
0
            s <<= 8;
531
0
            s |= input[full_batch_size + i];
532
0
        }
533
534
        // tail * 8 is the number of bits that are left to process
535
        // tail * 8 - more_bit is to remove the last more_bit
536
0
        valid_bit = (tail_count << 3) - more_bit;
537
0
        s >>= more_bit;
538
539
        // same as before
540
0
        if (need_bit) {
541
0
            if constexpr (sizeof(T) <= 4) {
542
0
                *output |= static_cast<T>(static_cast<uint32_t>(
543
0
                        (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1)));
544
            } else {
545
                *output |= static_cast<T>((s >> (valid_bit - need_bit)) &
546
                                          ((static_cast<U>(1) << need_bit) - 1));
547
            }
548
0
            output++;
549
0
            valid_bit -= need_bit;
550
0
        }
551
552
0
        int num = valid_bit / bit_width; // How many outputs can be processed at a time
553
554
        // same as before
555
0
        for (int j = 0; j < num; j++) {
556
0
            *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask);
557
0
            output++;
558
0
        }
559
0
    }
560
5
}
Unexecuted instantiation: _ZN5doris10ForDecoderIjE19bit_unpack_optimizeInEEvPKhhiPj
Unexecuted instantiation: _ZN5doris10ForDecoderImE19bit_unpack_optimizeIlEEvPKhhiPm
Unexecuted instantiation: _ZN5doris10ForDecoderImE19bit_unpack_optimizeInEEvPKhhiPm
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE19bit_unpack_optimizeIlEEvPKhhiPS1_
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE19bit_unpack_optimizeInEEvPKhhiPS1_
Unexecuted instantiation: _ZN5doris10ForDecoderIoE19bit_unpack_optimizeIlEEvPKhhiPo
Unexecuted instantiation: _ZN5doris10ForDecoderIoE19bit_unpack_optimizeInEEvPKhhiPo
561
562
// The reverse of bit_pack method, get original integer data list from packed bits
563
// param[in] input: the packed bits need to unpack
564
// param[in] in_num: the integer number in packed bits
565
// param[in] bit_width: how many bit we used to store each integer data
566
// param[out] output: the original integer data list
567
template <typename T>
568
81.3k
void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) {
569
    /*
570
        When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times.
571
        When bit_width > 64, we must use __int128_t and unroll the loop 16 times.
572
    */
573
81.3k
    if (bit_width <= 32) {
574
20.6k
        bit_unpack_optimize<int64_t>(input, in_num, bit_width, output);
575
60.6k
    } else {
576
60.6k
        bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output);
577
60.6k
    }
578
81.3k
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE10bit_unpackEPKhhiPa
Unexecuted instantiation: _ZN5doris10ForDecoderIsE10bit_unpackEPKhhiPs
_ZN5doris10ForDecoderIiE10bit_unpackEPKhhiPi
Line
Count
Source
568
9
void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) {
569
    /*
570
        When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times.
571
        When bit_width > 64, we must use __int128_t and unroll the loop 16 times.
572
    */
573
9
    if (bit_width <= 32) {
574
9
        bit_unpack_optimize<int64_t>(input, in_num, bit_width, output);
575
9
    } else {
576
0
        bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output);
577
0
    }
578
9
}
_ZN5doris10ForDecoderIlE10bit_unpackEPKhhiPl
Line
Count
Source
568
24.4k
void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) {
569
    /*
570
        When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times.
571
        When bit_width > 64, we must use __int128_t and unroll the loop 16 times.
572
    */
573
24.4k
    if (bit_width <= 32) {
574
12.3k
        bit_unpack_optimize<int64_t>(input, in_num, bit_width, output);
575
12.3k
    } else {
576
12.1k
        bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output);
577
12.1k
    }
578
24.4k
}
_ZN5doris10ForDecoderInE10bit_unpackEPKhhiPn
Line
Count
Source
568
56.8k
void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) {
569
    /*
570
        When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times.
571
        When bit_width > 64, we must use __int128_t and unroll the loop 16 times.
572
    */
573
56.8k
    if (bit_width <= 32) {
574
8.28k
        bit_unpack_optimize<int64_t>(input, in_num, bit_width, output);
575
48.5k
    } else {
576
48.5k
        bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output);
577
48.5k
    }
578
56.8k
}
Unexecuted instantiation: _ZN5doris10ForDecoderIhE10bit_unpackEPKhhiPh
Unexecuted instantiation: _ZN5doris10ForDecoderItE10bit_unpackEPKhhiPt
_ZN5doris10ForDecoderIjE10bit_unpackEPKhhiPj
Line
Count
Source
568
5
void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) {
569
    /*
570
        When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times.
571
        When bit_width > 64, we must use __int128_t and unroll the loop 16 times.
572
    */
573
5
    if (bit_width <= 32) {
574
5
        bit_unpack_optimize<int64_t>(input, in_num, bit_width, output);
575
5
    } else {
576
0
        bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output);
577
0
    }
578
5
}
Unexecuted instantiation: _ZN5doris10ForDecoderImE10bit_unpackEPKhhiPm
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE10bit_unpackEPKhhiPS1_
Unexecuted instantiation: _ZN5doris10ForDecoderIoE10bit_unpackEPKhhiPo
579
580
template <typename T>
581
4.17M
void ForDecoder<T>::decode_current_frame(T* output) {
582
4.17M
    uint32_t frame_index = _current_index / _max_frame_size;
583
4.17M
    if (frame_index == _current_decoded_frame) {
584
4.12M
        return; // current frame already decoded
585
4.12M
    }
586
48.9k
    _current_decoded_frame = frame_index;
587
48.9k
    uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index));
588
589
48.9k
    uint32_t base_offset = _frame_offsets[_current_decoded_frame];
590
48.9k
    T min = 0;
591
48.9k
    uint32_t delta_offset = 0;
592
48.9k
    if constexpr (sizeof(T) == 16) {
593
24.4k
        min = static_cast<T>(decode_fixed128_le(_buffer + base_offset));
594
24.4k
        delta_offset = base_offset + 16;
595
24.4k
    } else if constexpr (sizeof(T) == 8) {
596
24.4k
        min = static_cast<T>(decode_fixed64_le(_buffer + base_offset));
597
24.4k
        delta_offset = base_offset + 8;
598
24.4k
    } else {
599
14
        min = static_cast<T>(decode_fixed32_le(_buffer + base_offset));
600
14
        delta_offset = base_offset + 4;
601
14
    }
602
603
48.9k
    uint8_t bit_width = _bit_widths[_current_decoded_frame];
604
605
48.9k
    bool is_original_value = _storage_formats[_current_decoded_frame] == 2;
606
48.9k
    if (is_original_value) {
607
0
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output);
608
48.9k
    } else {
609
48.9k
        bool is_ascending = _storage_formats[_current_decoded_frame] == 1;
610
48.9k
        std::vector<T> delta_values(current_frame_size);
611
48.9k
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data());
612
48.9k
        if (is_ascending) {
613
451
            T pre_value = min;
614
3.87k
            for (uint8_t i = 0; i < current_frame_size; i++) {
615
3.42k
                T value = delta_values[i] + pre_value;
616
3.42k
                output[i] = value;
617
3.42k
                pre_value = value;
618
3.42k
            }
619
48.4k
        } else {
620
4.22M
            for (uint8_t i = 0; i < current_frame_size; i++) {
621
4.17M
                output[i] = delta_values[i] + min;
622
4.17M
            }
623
48.4k
        }
624
48.9k
    }
625
48.9k
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE20decode_current_frameEPa
Unexecuted instantiation: _ZN5doris10ForDecoderIsE20decode_current_frameEPs
_ZN5doris10ForDecoderIiE20decode_current_frameEPi
Line
Count
Source
581
10
void ForDecoder<T>::decode_current_frame(T* output) {
582
10
    uint32_t frame_index = _current_index / _max_frame_size;
583
10
    if (frame_index == _current_decoded_frame) {
584
1
        return; // current frame already decoded
585
1
    }
586
9
    _current_decoded_frame = frame_index;
587
9
    uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index));
588
589
9
    uint32_t base_offset = _frame_offsets[_current_decoded_frame];
590
9
    T min = 0;
591
9
    uint32_t delta_offset = 0;
592
    if constexpr (sizeof(T) == 16) {
593
        min = static_cast<T>(decode_fixed128_le(_buffer + base_offset));
594
        delta_offset = base_offset + 16;
595
    } else if constexpr (sizeof(T) == 8) {
596
        min = static_cast<T>(decode_fixed64_le(_buffer + base_offset));
597
        delta_offset = base_offset + 8;
598
9
    } else {
599
9
        min = static_cast<T>(decode_fixed32_le(_buffer + base_offset));
600
9
        delta_offset = base_offset + 4;
601
9
    }
602
603
9
    uint8_t bit_width = _bit_widths[_current_decoded_frame];
604
605
9
    bool is_original_value = _storage_formats[_current_decoded_frame] == 2;
606
9
    if (is_original_value) {
607
0
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output);
608
9
    } else {
609
9
        bool is_ascending = _storage_formats[_current_decoded_frame] == 1;
610
9
        std::vector<T> delta_values(current_frame_size);
611
9
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data());
612
9
        if (is_ascending) {
613
9
            T pre_value = min;
614
780
            for (uint8_t i = 0; i < current_frame_size; i++) {
615
771
                T value = delta_values[i] + pre_value;
616
771
                output[i] = value;
617
771
                pre_value = value;
618
771
            }
619
9
        } else {
620
0
            for (uint8_t i = 0; i < current_frame_size; i++) {
621
0
                output[i] = delta_values[i] + min;
622
0
            }
623
0
        }
624
9
    }
625
9
}
_ZN5doris10ForDecoderIlE20decode_current_frameEPl
Line
Count
Source
581
2.08M
void ForDecoder<T>::decode_current_frame(T* output) {
582
2.08M
    uint32_t frame_index = _current_index / _max_frame_size;
583
2.08M
    if (frame_index == _current_decoded_frame) {
584
2.06M
        return; // current frame already decoded
585
2.06M
    }
586
24.4k
    _current_decoded_frame = frame_index;
587
24.4k
    uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index));
588
589
24.4k
    uint32_t base_offset = _frame_offsets[_current_decoded_frame];
590
24.4k
    T min = 0;
591
24.4k
    uint32_t delta_offset = 0;
592
    if constexpr (sizeof(T) == 16) {
593
        min = static_cast<T>(decode_fixed128_le(_buffer + base_offset));
594
        delta_offset = base_offset + 16;
595
24.4k
    } else if constexpr (sizeof(T) == 8) {
596
24.4k
        min = static_cast<T>(decode_fixed64_le(_buffer + base_offset));
597
24.4k
        delta_offset = base_offset + 8;
598
    } else {
599
        min = static_cast<T>(decode_fixed32_le(_buffer + base_offset));
600
        delta_offset = base_offset + 4;
601
    }
602
603
24.4k
    uint8_t bit_width = _bit_widths[_current_decoded_frame];
604
605
24.4k
    bool is_original_value = _storage_formats[_current_decoded_frame] == 2;
606
24.4k
    if (is_original_value) {
607
0
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output);
608
24.4k
    } else {
609
24.4k
        bool is_ascending = _storage_formats[_current_decoded_frame] == 1;
610
24.4k
        std::vector<T> delta_values(current_frame_size);
611
24.4k
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data());
612
24.4k
        if (is_ascending) {
613
223
            T pre_value = min;
614
1.89k
            for (uint8_t i = 0; i < current_frame_size; i++) {
615
1.67k
                T value = delta_values[i] + pre_value;
616
1.67k
                output[i] = value;
617
1.67k
                pre_value = value;
618
1.67k
            }
619
24.2k
        } else {
620
2.11M
            for (uint8_t i = 0; i < current_frame_size; i++) {
621
2.08M
                output[i] = delta_values[i] + min;
622
2.08M
            }
623
24.2k
        }
624
24.4k
    }
625
24.4k
}
_ZN5doris10ForDecoderInE20decode_current_frameEPn
Line
Count
Source
581
2.08M
void ForDecoder<T>::decode_current_frame(T* output) {
582
2.08M
    uint32_t frame_index = _current_index / _max_frame_size;
583
2.08M
    if (frame_index == _current_decoded_frame) {
584
2.06M
        return; // current frame already decoded
585
2.06M
    }
586
24.4k
    _current_decoded_frame = frame_index;
587
24.4k
    uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index));
588
589
24.4k
    uint32_t base_offset = _frame_offsets[_current_decoded_frame];
590
24.4k
    T min = 0;
591
24.4k
    uint32_t delta_offset = 0;
592
24.4k
    if constexpr (sizeof(T) == 16) {
593
24.4k
        min = static_cast<T>(decode_fixed128_le(_buffer + base_offset));
594
24.4k
        delta_offset = base_offset + 16;
595
    } else if constexpr (sizeof(T) == 8) {
596
        min = static_cast<T>(decode_fixed64_le(_buffer + base_offset));
597
        delta_offset = base_offset + 8;
598
    } else {
599
        min = static_cast<T>(decode_fixed32_le(_buffer + base_offset));
600
        delta_offset = base_offset + 4;
601
    }
602
603
24.4k
    uint8_t bit_width = _bit_widths[_current_decoded_frame];
604
605
24.4k
    bool is_original_value = _storage_formats[_current_decoded_frame] == 2;
606
24.4k
    if (is_original_value) {
607
0
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output);
608
24.4k
    } else {
609
24.4k
        bool is_ascending = _storage_formats[_current_decoded_frame] == 1;
610
24.4k
        std::vector<T> delta_values(current_frame_size);
611
24.4k
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data());
612
24.4k
        if (is_ascending) {
613
214
            T pre_value = min;
614
552
            for (uint8_t i = 0; i < current_frame_size; i++) {
615
338
                T value = delta_values[i] + pre_value;
616
338
                output[i] = value;
617
338
                pre_value = value;
618
338
            }
619
24.2k
        } else {
620
2.11M
            for (uint8_t i = 0; i < current_frame_size; i++) {
621
2.08M
                output[i] = delta_values[i] + min;
622
2.08M
            }
623
24.2k
        }
624
24.4k
    }
625
24.4k
}
Unexecuted instantiation: _ZN5doris10ForDecoderIhE20decode_current_frameEPh
Unexecuted instantiation: _ZN5doris10ForDecoderItE20decode_current_frameEPt
_ZN5doris10ForDecoderIjE20decode_current_frameEPj
Line
Count
Source
581
5
void ForDecoder<T>::decode_current_frame(T* output) {
582
5
    uint32_t frame_index = _current_index / _max_frame_size;
583
5
    if (frame_index == _current_decoded_frame) {
584
0
        return; // current frame already decoded
585
0
    }
586
5
    _current_decoded_frame = frame_index;
587
5
    uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index));
588
589
5
    uint32_t base_offset = _frame_offsets[_current_decoded_frame];
590
5
    T min = 0;
591
5
    uint32_t delta_offset = 0;
592
    if constexpr (sizeof(T) == 16) {
593
        min = static_cast<T>(decode_fixed128_le(_buffer + base_offset));
594
        delta_offset = base_offset + 16;
595
    } else if constexpr (sizeof(T) == 8) {
596
        min = static_cast<T>(decode_fixed64_le(_buffer + base_offset));
597
        delta_offset = base_offset + 8;
598
5
    } else {
599
5
        min = static_cast<T>(decode_fixed32_le(_buffer + base_offset));
600
5
        delta_offset = base_offset + 4;
601
5
    }
602
603
5
    uint8_t bit_width = _bit_widths[_current_decoded_frame];
604
605
5
    bool is_original_value = _storage_formats[_current_decoded_frame] == 2;
606
5
    if (is_original_value) {
607
0
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output);
608
5
    } else {
609
5
        bool is_ascending = _storage_formats[_current_decoded_frame] == 1;
610
5
        std::vector<T> delta_values(current_frame_size);
611
5
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data());
612
5
        if (is_ascending) {
613
5
            T pre_value = min;
614
645
            for (uint8_t i = 0; i < current_frame_size; i++) {
615
640
                T value = delta_values[i] + pre_value;
616
640
                output[i] = value;
617
640
                pre_value = value;
618
640
            }
619
5
        } else {
620
0
            for (uint8_t i = 0; i < current_frame_size; i++) {
621
0
                output[i] = delta_values[i] + min;
622
0
            }
623
0
        }
624
5
    }
625
5
}
Unexecuted instantiation: _ZN5doris10ForDecoderImE20decode_current_frameEPm
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE20decode_current_frameEPS1_
Unexecuted instantiation: _ZN5doris10ForDecoderIoE20decode_current_frameEPo
626
627
template <typename T>
628
12
T ForDecoder<T>::decode_frame_min_value(uint32_t frame_index) {
629
12
    uint32_t min_offset = _frame_offsets[frame_index];
630
12
    T min = 0;
631
12
    if constexpr (sizeof(T) == 16) {
632
0
        min = static_cast<T>(decode_fixed128_le(_buffer + min_offset));
633
12
    } else if constexpr (sizeof(T) == 8) {
634
12
        min = static_cast<T>(decode_fixed64_le(_buffer + min_offset));
635
12
    } else {
636
0
        min = static_cast<T>(decode_fixed32_le(_buffer + min_offset));
637
0
    }
638
12
    return min;
639
12
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE22decode_frame_min_valueEj
Unexecuted instantiation: _ZN5doris10ForDecoderIsE22decode_frame_min_valueEj
Unexecuted instantiation: _ZN5doris10ForDecoderIiE22decode_frame_min_valueEj
_ZN5doris10ForDecoderIlE22decode_frame_min_valueEj
Line
Count
Source
628
12
T ForDecoder<T>::decode_frame_min_value(uint32_t frame_index) {
629
12
    uint32_t min_offset = _frame_offsets[frame_index];
630
12
    T min = 0;
631
    if constexpr (sizeof(T) == 16) {
632
        min = static_cast<T>(decode_fixed128_le(_buffer + min_offset));
633
12
    } else if constexpr (sizeof(T) == 8) {
634
12
        min = static_cast<T>(decode_fixed64_le(_buffer + min_offset));
635
    } else {
636
        min = static_cast<T>(decode_fixed32_le(_buffer + min_offset));
637
    }
638
12
    return min;
639
12
}
Unexecuted instantiation: _ZN5doris10ForDecoderInE22decode_frame_min_valueEj
Unexecuted instantiation: _ZN5doris10ForDecoderIhE22decode_frame_min_valueEj
Unexecuted instantiation: _ZN5doris10ForDecoderItE22decode_frame_min_valueEj
Unexecuted instantiation: _ZN5doris10ForDecoderIjE22decode_frame_min_valueEj
Unexecuted instantiation: _ZN5doris10ForDecoderImE22decode_frame_min_valueEj
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE22decode_frame_min_valueEj
Unexecuted instantiation: _ZN5doris10ForDecoderIoE22decode_frame_min_valueEj
640
641
template <typename T>
642
4.17M
T* ForDecoder<T>::copy_value(T* val, size_t count) {
643
4.17M
    memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count);
644
4.17M
    _current_index += count;
645
4.17M
    val += count;
646
4.17M
    return val;
647
4.17M
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE10copy_valueEPam
Unexecuted instantiation: _ZN5doris10ForDecoderIsE10copy_valueEPsm
_ZN5doris10ForDecoderIiE10copy_valueEPim
Line
Count
Source
642
8
T* ForDecoder<T>::copy_value(T* val, size_t count) {
643
8
    memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count);
644
8
    _current_index += count;
645
8
    val += count;
646
8
    return val;
647
8
}
_ZN5doris10ForDecoderIlE10copy_valueEPlm
Line
Count
Source
642
2.08M
T* ForDecoder<T>::copy_value(T* val, size_t count) {
643
2.08M
    memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count);
644
2.08M
    _current_index += count;
645
2.08M
    val += count;
646
2.08M
    return val;
647
2.08M
}
_ZN5doris10ForDecoderInE10copy_valueEPnm
Line
Count
Source
642
2.08M
T* ForDecoder<T>::copy_value(T* val, size_t count) {
643
2.08M
    memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count);
644
2.08M
    _current_index += count;
645
2.08M
    val += count;
646
2.08M
    return val;
647
2.08M
}
Unexecuted instantiation: _ZN5doris10ForDecoderIhE10copy_valueEPhm
Unexecuted instantiation: _ZN5doris10ForDecoderItE10copy_valueEPtm
_ZN5doris10ForDecoderIjE10copy_valueEPjm
Line
Count
Source
642
3
T* ForDecoder<T>::copy_value(T* val, size_t count) {
643
3
    memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count);
644
3
    _current_index += count;
645
3
    val += count;
646
3
    return val;
647
3
}
Unexecuted instantiation: _ZN5doris10ForDecoderImE10copy_valueEPmm
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE10copy_valueEPS1_m
Unexecuted instantiation: _ZN5doris10ForDecoderIoE10copy_valueEPom
648
649
template <typename T>
650
4.17M
bool ForDecoder<T>::get_batch(T* val, size_t count) {
651
4.17M
    if (_current_index + count > _values_num) {
652
1
        return false;
653
1
    }
654
655
4.17M
    decode_current_frame(_out_buffer.data());
656
657
4.17M
    if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) {
658
4.16M
        copy_value(val, count);
659
4.16M
        return true;
660
4.16M
    }
661
662
    // 1. padding one frame
663
16.3k
    size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index;
664
16.3k
    val = copy_value(val, padding_num);
665
666
    // 2. process frame by frame
667
16.3k
    size_t frame_count = (count - padding_num) / _max_frame_size;
668
16.4k
    for (size_t i = 0; i < frame_count; i++) {
669
        // directly decode value to the output, don't buffer the value
670
7
        decode_current_frame(val);
671
7
        _current_index += _max_frame_size;
672
7
        val += _max_frame_size;
673
7
    }
674
675
    // 3. process remaining value
676
16.3k
    size_t remaining_num = (count - padding_num) % _max_frame_size;
677
16.3k
    if (remaining_num > 0) {
678
4
        decode_current_frame(_out_buffer.data());
679
4
        val = copy_value(val, remaining_num);
680
4
    }
681
682
16.3k
    return true;
683
4.17M
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE9get_batchEPam
Unexecuted instantiation: _ZN5doris10ForDecoderIsE9get_batchEPsm
_ZN5doris10ForDecoderIiE9get_batchEPim
Line
Count
Source
650
8
bool ForDecoder<T>::get_batch(T* val, size_t count) {
651
8
    if (_current_index + count > _values_num) {
652
1
        return false;
653
1
    }
654
655
7
    decode_current_frame(_out_buffer.data());
656
657
7
    if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) {
658
4
        copy_value(val, count);
659
4
        return true;
660
4
    }
661
662
    // 1. padding one frame
663
3
    size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index;
664
3
    val = copy_value(val, padding_num);
665
666
    // 2. process frame by frame
667
3
    size_t frame_count = (count - padding_num) / _max_frame_size;
668
5
    for (size_t i = 0; i < frame_count; i++) {
669
        // directly decode value to the output, don't buffer the value
670
2
        decode_current_frame(val);
671
2
        _current_index += _max_frame_size;
672
2
        val += _max_frame_size;
673
2
    }
674
675
    // 3. process remaining value
676
3
    size_t remaining_num = (count - padding_num) % _max_frame_size;
677
3
    if (remaining_num > 0) {
678
1
        decode_current_frame(_out_buffer.data());
679
1
        val = copy_value(val, remaining_num);
680
1
    }
681
682
3
    return true;
683
7
}
_ZN5doris10ForDecoderIlE9get_batchEPlm
Line
Count
Source
650
2.08M
bool ForDecoder<T>::get_batch(T* val, size_t count) {
651
2.08M
    if (_current_index + count > _values_num) {
652
0
        return false;
653
0
    }
654
655
2.08M
    decode_current_frame(_out_buffer.data());
656
657
2.08M
    if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) {
658
2.08M
        copy_value(val, count);
659
2.08M
        return true;
660
2.08M
    }
661
662
    // 1. padding one frame
663
8.19k
    size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index;
664
8.19k
    val = copy_value(val, padding_num);
665
666
    // 2. process frame by frame
667
8.19k
    size_t frame_count = (count - padding_num) / _max_frame_size;
668
8.19k
    for (size_t i = 0; i < frame_count; i++) {
669
        // directly decode value to the output, don't buffer the value
670
3
        decode_current_frame(val);
671
3
        _current_index += _max_frame_size;
672
3
        val += _max_frame_size;
673
3
    }
674
675
    // 3. process remaining value
676
8.19k
    size_t remaining_num = (count - padding_num) % _max_frame_size;
677
8.19k
    if (remaining_num > 0) {
678
3
        decode_current_frame(_out_buffer.data());
679
3
        val = copy_value(val, remaining_num);
680
3
    }
681
682
8.19k
    return true;
683
2.08M
}
_ZN5doris10ForDecoderInE9get_batchEPnm
Line
Count
Source
650
2.08M
bool ForDecoder<T>::get_batch(T* val, size_t count) {
651
2.08M
    if (_current_index + count > _values_num) {
652
0
        return false;
653
0
    }
654
655
2.08M
    decode_current_frame(_out_buffer.data());
656
657
2.08M
    if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) {
658
2.08M
        copy_value(val, count);
659
2.08M
        return true;
660
2.08M
    }
661
662
    // 1. padding one frame
663
8.19k
    size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index;
664
8.19k
    val = copy_value(val, padding_num);
665
666
    // 2. process frame by frame
667
8.19k
    size_t frame_count = (count - padding_num) / _max_frame_size;
668
8.19k
    for (size_t i = 0; i < frame_count; i++) {
669
        // directly decode value to the output, don't buffer the value
670
0
        decode_current_frame(val);
671
0
        _current_index += _max_frame_size;
672
0
        val += _max_frame_size;
673
0
    }
674
675
    // 3. process remaining value
676
8.19k
    size_t remaining_num = (count - padding_num) % _max_frame_size;
677
8.19k
    if (remaining_num > 0) {
678
0
        decode_current_frame(_out_buffer.data());
679
0
        val = copy_value(val, remaining_num);
680
0
    }
681
682
8.19k
    return true;
683
2.08M
}
Unexecuted instantiation: _ZN5doris10ForDecoderIhE9get_batchEPhm
Unexecuted instantiation: _ZN5doris10ForDecoderItE9get_batchEPtm
_ZN5doris10ForDecoderIjE9get_batchEPjm
Line
Count
Source
650
3
bool ForDecoder<T>::get_batch(T* val, size_t count) {
651
3
    if (_current_index + count > _values_num) {
652
0
        return false;
653
0
    }
654
655
3
    decode_current_frame(_out_buffer.data());
656
657
3
    if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) {
658
0
        copy_value(val, count);
659
0
        return true;
660
0
    }
661
662
    // 1. padding one frame
663
3
    size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index;
664
3
    val = copy_value(val, padding_num);
665
666
    // 2. process frame by frame
667
3
    size_t frame_count = (count - padding_num) / _max_frame_size;
668
5
    for (size_t i = 0; i < frame_count; i++) {
669
        // directly decode value to the output, don't buffer the value
670
2
        decode_current_frame(val);
671
2
        _current_index += _max_frame_size;
672
2
        val += _max_frame_size;
673
2
    }
674
675
    // 3. process remaining value
676
3
    size_t remaining_num = (count - padding_num) % _max_frame_size;
677
3
    if (remaining_num > 0) {
678
0
        decode_current_frame(_out_buffer.data());
679
0
        val = copy_value(val, remaining_num);
680
0
    }
681
682
3
    return true;
683
3
}
Unexecuted instantiation: _ZN5doris10ForDecoderImE9get_batchEPmm
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE9get_batchEPS1_m
Unexecuted instantiation: _ZN5doris10ForDecoderIoE9get_batchEPom
684
685
template <typename T>
686
3
bool ForDecoder<T>::skip(int32_t skip_num) {
687
3
    if (_current_index + skip_num >= _values_num) {
688
0
        return false;
689
0
    }
690
3
    _current_index = _current_index + skip_num;
691
3
    return true;
692
3
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE4skipEi
Unexecuted instantiation: _ZN5doris10ForDecoderIsE4skipEi
Unexecuted instantiation: _ZN5doris10ForDecoderIiE4skipEi
Unexecuted instantiation: _ZN5doris10ForDecoderIlE4skipEi
Unexecuted instantiation: _ZN5doris10ForDecoderInE4skipEi
Unexecuted instantiation: _ZN5doris10ForDecoderIhE4skipEi
Unexecuted instantiation: _ZN5doris10ForDecoderItE4skipEi
_ZN5doris10ForDecoderIjE4skipEi
Line
Count
Source
686
3
bool ForDecoder<T>::skip(int32_t skip_num) {
687
3
    if (_current_index + skip_num >= _values_num) {
688
0
        return false;
689
0
    }
690
3
    _current_index = _current_index + skip_num;
691
3
    return true;
692
3
}
Unexecuted instantiation: _ZN5doris10ForDecoderImE4skipEi
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE4skipEi
Unexecuted instantiation: _ZN5doris10ForDecoderIoE4skipEi
693
694
template <typename T>
695
6
uint32_t ForDecoder<T>::seek_last_frame_before_value(T target) {
696
    // first of all, find the first frame >= target
697
6
    uint32_t left = 0;
698
6
    uint32_t right = _frame_count;
699
18
    while (left < right) {
700
12
        uint32_t mid = left + (right - left) / 2;
701
12
        T midValue = decode_frame_min_value(mid);
702
12
        if (midValue < target) {
703
6
            left = mid + 1;
704
6
        } else {
705
6
            right = mid;
706
6
        }
707
12
    }
708
    // after loop, left is the first frame >= target
709
6
    if (left == 0) {
710
        // all frames are >= target, not found
711
2
        return _frame_count;
712
2
    }
713
    // otherwise previous frame is the last frame < target
714
4
    return left - 1;
715
6
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE28seek_last_frame_before_valueEa
Unexecuted instantiation: _ZN5doris10ForDecoderIsE28seek_last_frame_before_valueEs
Unexecuted instantiation: _ZN5doris10ForDecoderIiE28seek_last_frame_before_valueEi
_ZN5doris10ForDecoderIlE28seek_last_frame_before_valueEl
Line
Count
Source
695
6
uint32_t ForDecoder<T>::seek_last_frame_before_value(T target) {
696
    // first of all, find the first frame >= target
697
6
    uint32_t left = 0;
698
6
    uint32_t right = _frame_count;
699
18
    while (left < right) {
700
12
        uint32_t mid = left + (right - left) / 2;
701
12
        T midValue = decode_frame_min_value(mid);
702
12
        if (midValue < target) {
703
6
            left = mid + 1;
704
6
        } else {
705
6
            right = mid;
706
6
        }
707
12
    }
708
    // after loop, left is the first frame >= target
709
6
    if (left == 0) {
710
        // all frames are >= target, not found
711
2
        return _frame_count;
712
2
    }
713
    // otherwise previous frame is the last frame < target
714
4
    return left - 1;
715
6
}
Unexecuted instantiation: _ZN5doris10ForDecoderInE28seek_last_frame_before_valueEn
Unexecuted instantiation: _ZN5doris10ForDecoderIhE28seek_last_frame_before_valueEh
Unexecuted instantiation: _ZN5doris10ForDecoderItE28seek_last_frame_before_valueEt
Unexecuted instantiation: _ZN5doris10ForDecoderIjE28seek_last_frame_before_valueEj
Unexecuted instantiation: _ZN5doris10ForDecoderImE28seek_last_frame_before_valueEm
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE28seek_last_frame_before_valueES1_
Unexecuted instantiation: _ZN5doris10ForDecoderIoE28seek_last_frame_before_valueEo
716
717
template <typename T>
718
bool ForDecoder<T>::seek_lower_bound_inside_frame(uint32_t frame_index, T target,
719
4
                                                  bool* exact_match) {
720
4
    _current_index = frame_index * _max_frame_size;
721
4
    decode_current_frame(_out_buffer.data());
722
4
    auto end = _out_buffer.begin() + frame_size(frame_index);
723
4
    auto pos = std::lower_bound(_out_buffer.begin(), end, target);
724
4
    if (pos != end) { // found in this frame
725
2
        auto pos_in_frame = cast_set<uint32_t>(std::distance(_out_buffer.begin(), pos));
726
2
        *exact_match = _out_buffer[pos_in_frame] == target;
727
2
        _current_index += pos_in_frame;
728
2
        return true;
729
2
    }
730
2
    return false;
731
4
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE29seek_lower_bound_inside_frameEjaPb
Unexecuted instantiation: _ZN5doris10ForDecoderIsE29seek_lower_bound_inside_frameEjsPb
Unexecuted instantiation: _ZN5doris10ForDecoderIiE29seek_lower_bound_inside_frameEjiPb
_ZN5doris10ForDecoderIlE29seek_lower_bound_inside_frameEjlPb
Line
Count
Source
719
4
                                                  bool* exact_match) {
720
4
    _current_index = frame_index * _max_frame_size;
721
4
    decode_current_frame(_out_buffer.data());
722
4
    auto end = _out_buffer.begin() + frame_size(frame_index);
723
4
    auto pos = std::lower_bound(_out_buffer.begin(), end, target);
724
4
    if (pos != end) { // found in this frame
725
2
        auto pos_in_frame = cast_set<uint32_t>(std::distance(_out_buffer.begin(), pos));
726
2
        *exact_match = _out_buffer[pos_in_frame] == target;
727
2
        _current_index += pos_in_frame;
728
2
        return true;
729
2
    }
730
2
    return false;
731
4
}
Unexecuted instantiation: _ZN5doris10ForDecoderInE29seek_lower_bound_inside_frameEjnPb
Unexecuted instantiation: _ZN5doris10ForDecoderIhE29seek_lower_bound_inside_frameEjhPb
Unexecuted instantiation: _ZN5doris10ForDecoderItE29seek_lower_bound_inside_frameEjtPb
Unexecuted instantiation: _ZN5doris10ForDecoderIjE29seek_lower_bound_inside_frameEjjPb
Unexecuted instantiation: _ZN5doris10ForDecoderImE29seek_lower_bound_inside_frameEjmPb
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE29seek_lower_bound_inside_frameEjS1_Pb
Unexecuted instantiation: _ZN5doris10ForDecoderIoE29seek_lower_bound_inside_frameEjoPb
732
733
template <typename T>
734
6
bool ForDecoder<T>::seek_at_or_after_value(const void* value, bool* exact_match) {
735
6
    T target = *reinterpret_cast<const T*>(value);
736
6
    uint32_t frame_to_search = seek_last_frame_before_value(target);
737
6
    if (frame_to_search == _frame_count) {
738
        // all frames are >= target, the searched value must be the first value
739
2
        _current_index = 0;
740
2
        decode_current_frame(_out_buffer.data());
741
2
        *exact_match = _out_buffer[0] == target;
742
2
        return true;
743
2
    }
744
    // binary search inside the last frame < target
745
4
    bool found = seek_lower_bound_inside_frame(frame_to_search, target, exact_match);
746
    // if not found, all values in the last frame are less than target.
747
    // then the searched value must be the first value of the next frame.
748
4
    if (!found && frame_to_search < _frame_count - 1) {
749
1
        _current_index = (frame_to_search + 1) * _max_frame_size;
750
1
        decode_current_frame(_out_buffer.data());
751
1
        *exact_match = _out_buffer[0] == target;
752
1
        return true;
753
1
    }
754
3
    return found;
755
4
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE22seek_at_or_after_valueEPKvPb
Unexecuted instantiation: _ZN5doris10ForDecoderIsE22seek_at_or_after_valueEPKvPb
Unexecuted instantiation: _ZN5doris10ForDecoderIiE22seek_at_or_after_valueEPKvPb
_ZN5doris10ForDecoderIlE22seek_at_or_after_valueEPKvPb
Line
Count
Source
734
6
bool ForDecoder<T>::seek_at_or_after_value(const void* value, bool* exact_match) {
735
6
    T target = *reinterpret_cast<const T*>(value);
736
6
    uint32_t frame_to_search = seek_last_frame_before_value(target);
737
6
    if (frame_to_search == _frame_count) {
738
        // all frames are >= target, the searched value must be the first value
739
2
        _current_index = 0;
740
2
        decode_current_frame(_out_buffer.data());
741
2
        *exact_match = _out_buffer[0] == target;
742
2
        return true;
743
2
    }
744
    // binary search inside the last frame < target
745
4
    bool found = seek_lower_bound_inside_frame(frame_to_search, target, exact_match);
746
    // if not found, all values in the last frame are less than target.
747
    // then the searched value must be the first value of the next frame.
748
4
    if (!found && frame_to_search < _frame_count - 1) {
749
1
        _current_index = (frame_to_search + 1) * _max_frame_size;
750
1
        decode_current_frame(_out_buffer.data());
751
1
        *exact_match = _out_buffer[0] == target;
752
1
        return true;
753
1
    }
754
3
    return found;
755
4
}
Unexecuted instantiation: _ZN5doris10ForDecoderInE22seek_at_or_after_valueEPKvPb
Unexecuted instantiation: _ZN5doris10ForDecoderIhE22seek_at_or_after_valueEPKvPb
Unexecuted instantiation: _ZN5doris10ForDecoderItE22seek_at_or_after_valueEPKvPb
Unexecuted instantiation: _ZN5doris10ForDecoderIjE22seek_at_or_after_valueEPKvPb
Unexecuted instantiation: _ZN5doris10ForDecoderImE22seek_at_or_after_valueEPKvPb
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE22seek_at_or_after_valueEPKvPb
Unexecuted instantiation: _ZN5doris10ForDecoderIoE22seek_at_or_after_valueEPKvPb
756
757
template class ForEncoder<int8_t>;
758
template class ForEncoder<int16_t>;
759
template class ForEncoder<int32_t>;
760
template class ForEncoder<int64_t>;
761
template class ForEncoder<int128_t>;
762
template class ForEncoder<uint8_t>;
763
template class ForEncoder<uint16_t>;
764
template class ForEncoder<uint32_t>;
765
template class ForEncoder<uint64_t>;
766
template class ForEncoder<uint24_t>;
767
template class ForEncoder<uint128_t>;
768
769
template class ForDecoder<int8_t>;
770
template class ForDecoder<int16_t>;
771
template class ForDecoder<int32_t>;
772
template class ForDecoder<int64_t>;
773
template class ForDecoder<int128_t>;
774
template class ForDecoder<uint8_t>;
775
template class ForDecoder<uint16_t>;
776
template class ForDecoder<uint32_t>;
777
template class ForDecoder<uint64_t>;
778
template class ForDecoder<uint24_t>;
779
template class ForDecoder<uint128_t>;
780
#include "common/compile_check_end.h"
781
} // namespace doris