Coverage Report

Created: 2026-03-19 11:39

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/util/frame_of_reference_coding.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "util/frame_of_reference_coding.h"
19
20
#include <glog/logging.h>
21
#include <sys/types.h>
22
23
#include <algorithm>
24
#include <cstring>
25
#include <iostream>
26
#include <iterator>
27
#include <limits>
28
29
#include "common/cast_set.h"
30
#include "exec/common/endian.h"
31
#include "util/bit_util.h"
32
#include "util/coding.h"
33
34
namespace doris {
35
#include "common/compile_check_begin.h"
36
37
template <typename T>
38
8.35M
const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) {
39
8.35M
    memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T));
40
8.35M
    _buffered_values_num += count;
41
8.35M
    p_data += count;
42
8.35M
    return p_data;
43
8.35M
}
Unexecuted instantiation: _ZN5doris10ForEncoderIaE10copy_valueEPKam
Unexecuted instantiation: _ZN5doris10ForEncoderIsE10copy_valueEPKsm
_ZN5doris10ForEncoderIiE10copy_valueEPKim
Line
Count
Source
38
16
const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) {
39
16
    memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T));
40
16
    _buffered_values_num += count;
41
16
    p_data += count;
42
16
    return p_data;
43
16
}
_ZN5doris10ForEncoderIlE10copy_valueEPKlm
Line
Count
Source
38
4.17M
const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) {
39
4.17M
    memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T));
40
4.17M
    _buffered_values_num += count;
41
4.17M
    p_data += count;
42
4.17M
    return p_data;
43
4.17M
}
_ZN5doris10ForEncoderInE10copy_valueEPKnm
Line
Count
Source
38
4.17M
const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) {
39
4.17M
    memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T));
40
4.17M
    _buffered_values_num += count;
41
4.17M
    p_data += count;
42
4.17M
    return p_data;
43
4.17M
}
Unexecuted instantiation: _ZN5doris10ForEncoderIhE10copy_valueEPKhm
Unexecuted instantiation: _ZN5doris10ForEncoderItE10copy_valueEPKtm
_ZN5doris10ForEncoderIjE10copy_valueEPKjm
Line
Count
Source
38
6
const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) {
39
6
    memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T));
40
6
    _buffered_values_num += count;
41
6
    p_data += count;
42
6
    return p_data;
43
6
}
Unexecuted instantiation: _ZN5doris10ForEncoderImE10copy_valueEPKmm
Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10copy_valueEPKS1_m
Unexecuted instantiation: _ZN5doris10ForEncoderIoE10copy_valueEPKom
44
45
template <typename T>
46
8.35M
void ForEncoder<T>::put_batch(const T* in_data, size_t count) {
47
8.35M
    if (_buffered_values_num + count < FRAME_VALUE_NUM) {
48
8.32M
        copy_value(in_data, count);
49
8.32M
        _values_num += count;
50
8.32M
        return;
51
8.32M
    }
52
53
    // 1. padding one frame
54
32.7k
    size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num;
55
32.7k
    in_data = copy_value(in_data, padding_num);
56
32.7k
    bit_packing_one_frame_value(_buffered_values);
57
58
    // 2. process frame by frame
59
32.7k
    size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM;
60
32.8k
    for (size_t i = 0; i < frame_size; i++) {
61
        // directly encode value to the bit_writer, don't buffer the value
62
16
        _buffered_values_num = FRAME_VALUE_NUM;
63
16
        bit_packing_one_frame_value(in_data);
64
16
        in_data += FRAME_VALUE_NUM;
65
16
    }
66
67
    // 3. process remaining value
68
32.7k
    size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM;
69
32.7k
    if (remaining_num > 0) {
70
8
        copy_value(in_data, remaining_num);
71
8
    }
72
73
32.7k
    _values_num += count;
74
32.7k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIaE9put_batchEPKam
Unexecuted instantiation: _ZN5doris10ForEncoderIsE9put_batchEPKsm
_ZN5doris10ForEncoderIiE9put_batchEPKim
Line
Count
Source
46
14
void ForEncoder<T>::put_batch(const T* in_data, size_t count) {
47
14
    if (_buffered_values_num + count < FRAME_VALUE_NUM) {
48
8
        copy_value(in_data, count);
49
8
        _values_num += count;
50
8
        return;
51
8
    }
52
53
    // 1. padding one frame
54
6
    size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num;
55
6
    in_data = copy_value(in_data, padding_num);
56
6
    bit_packing_one_frame_value(_buffered_values);
57
58
    // 2. process frame by frame
59
6
    size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM;
60
10
    for (size_t i = 0; i < frame_size; i++) {
61
        // directly encode value to the bit_writer, don't buffer the value
62
4
        _buffered_values_num = FRAME_VALUE_NUM;
63
4
        bit_packing_one_frame_value(in_data);
64
4
        in_data += FRAME_VALUE_NUM;
65
4
    }
66
67
    // 3. process remaining value
68
6
    size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM;
69
6
    if (remaining_num > 0) {
70
2
        copy_value(in_data, remaining_num);
71
2
    }
72
73
6
    _values_num += count;
74
6
}
_ZN5doris10ForEncoderIlE9put_batchEPKlm
Line
Count
Source
46
4.17M
void ForEncoder<T>::put_batch(const T* in_data, size_t count) {
47
4.17M
    if (_buffered_values_num + count < FRAME_VALUE_NUM) {
48
4.16M
        copy_value(in_data, count);
49
4.16M
        _values_num += count;
50
4.16M
        return;
51
4.16M
    }
52
53
    // 1. padding one frame
54
16.3k
    size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num;
55
16.3k
    in_data = copy_value(in_data, padding_num);
56
16.3k
    bit_packing_one_frame_value(_buffered_values);
57
58
    // 2. process frame by frame
59
16.3k
    size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM;
60
16.3k
    for (size_t i = 0; i < frame_size; i++) {
61
        // directly encode value to the bit_writer, don't buffer the value
62
6
        _buffered_values_num = FRAME_VALUE_NUM;
63
6
        bit_packing_one_frame_value(in_data);
64
6
        in_data += FRAME_VALUE_NUM;
65
6
    }
66
67
    // 3. process remaining value
68
16.3k
    size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM;
69
16.3k
    if (remaining_num > 0) {
70
6
        copy_value(in_data, remaining_num);
71
6
    }
72
73
16.3k
    _values_num += count;
74
16.3k
}
_ZN5doris10ForEncoderInE9put_batchEPKnm
Line
Count
Source
46
4.17M
void ForEncoder<T>::put_batch(const T* in_data, size_t count) {
47
4.17M
    if (_buffered_values_num + count < FRAME_VALUE_NUM) {
48
4.16M
        copy_value(in_data, count);
49
4.16M
        _values_num += count;
50
4.16M
        return;
51
4.16M
    }
52
53
    // 1. padding one frame
54
16.3k
    size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num;
55
16.3k
    in_data = copy_value(in_data, padding_num);
56
16.3k
    bit_packing_one_frame_value(_buffered_values);
57
58
    // 2. process frame by frame
59
16.3k
    size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM;
60
16.3k
    for (size_t i = 0; i < frame_size; i++) {
61
        // directly encode value to the bit_writer, don't buffer the value
62
0
        _buffered_values_num = FRAME_VALUE_NUM;
63
0
        bit_packing_one_frame_value(in_data);
64
0
        in_data += FRAME_VALUE_NUM;
65
0
    }
66
67
    // 3. process remaining value
68
16.3k
    size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM;
69
16.3k
    if (remaining_num > 0) {
70
0
        copy_value(in_data, remaining_num);
71
0
    }
72
73
16.3k
    _values_num += count;
74
16.3k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIhE9put_batchEPKhm
Unexecuted instantiation: _ZN5doris10ForEncoderItE9put_batchEPKtm
_ZN5doris10ForEncoderIjE9put_batchEPKjm
Line
Count
Source
46
6
void ForEncoder<T>::put_batch(const T* in_data, size_t count) {
47
6
    if (_buffered_values_num + count < FRAME_VALUE_NUM) {
48
0
        copy_value(in_data, count);
49
0
        _values_num += count;
50
0
        return;
51
0
    }
52
53
    // 1. padding one frame
54
6
    size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num;
55
6
    in_data = copy_value(in_data, padding_num);
56
6
    bit_packing_one_frame_value(_buffered_values);
57
58
    // 2. process frame by frame
59
6
    size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM;
60
12
    for (size_t i = 0; i < frame_size; i++) {
61
        // directly encode value to the bit_writer, don't buffer the value
62
6
        _buffered_values_num = FRAME_VALUE_NUM;
63
6
        bit_packing_one_frame_value(in_data);
64
6
        in_data += FRAME_VALUE_NUM;
65
6
    }
66
67
    // 3. process remaining value
68
6
    size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM;
69
6
    if (remaining_num > 0) {
70
0
        copy_value(in_data, remaining_num);
71
0
    }
72
73
6
    _values_num += count;
74
6
}
Unexecuted instantiation: _ZN5doris10ForEncoderImE9put_batchEPKmm
Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE9put_batchEPKS1_m
Unexecuted instantiation: _ZN5doris10ForEncoderIoE9put_batchEPKom
75
76
// todo(kks): improve this method by SIMD instructions
77
78
template <typename T>
79
30.6k
void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
80
30.6k
    int64_t s = 0;
81
30.6k
    uint8_t output_mask = 255;
82
30.6k
    int tail_count = in_num & 7;              // the remainder of in_num modulo 8
83
30.6k
    int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8
84
85
475k
    for (int i = 0; i < full_batch_size; i += 8) {
86
        // Put the 8 numbers in the input into s in order, each number occupies bit_width bit
87
445k
        s |= static_cast<int64_t>(input[i + 7]);
88
445k
        s |= (static_cast<int64_t>(input[i + 6])) << bit_width;
89
445k
        s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width);
90
445k
        s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width);
91
445k
        s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width);
92
445k
        s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width);
93
445k
        s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width);
94
445k
        s |= (static_cast<int64_t>(input[i])) << (7 * bit_width);
95
96
        // Starting with the highest valid bit, take out 8 bits in sequence
97
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
98
        // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end
99
2.44M
        for (int j = 0; j < bit_width; j++) {
100
2.00M
            output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask;
101
2.00M
        }
102
445k
        output += bit_width;
103
445k
        s = 0;
104
445k
    }
105
106
    // remainder
107
30.6k
    int byte = tail_count * bit_width; // How many bits are left to store
108
30.6k
    int bytes = (byte + 7) >> 3;       // How many more bytes are needed to store the rest of input
109
110
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
111
130k
    for (int i = 0; i < tail_count; i++) {
112
100k
        s |= (static_cast<int64_t>(input[i + full_batch_size]))
113
100k
             << ((tail_count - i - 1) * bit_width);
114
100k
    }
115
116
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
117
30.6k
    s <<= (bytes << 3) - byte;
118
119
    // Starting with the highest valid bit, take out 8 bits in sequence
120
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
121
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
122
96.8k
    for (int i = 0; i < bytes; i++) {
123
66.2k
        output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask;
124
66.2k
    }
125
30.6k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIaE10bit_pack_8EPKahiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIsE10bit_pack_8EPKshiPh
_ZN5doris10ForEncoderIiE10bit_pack_8EPKihiPh
Line
Count
Source
79
16
void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
80
16
    int64_t s = 0;
81
16
    uint8_t output_mask = 255;
82
16
    int tail_count = in_num & 7;              // the remainder of in_num modulo 8
83
16
    int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8
84
85
208
    for (int i = 0; i < full_batch_size; i += 8) {
86
        // Put the 8 numbers in the input into s in order, each number occupies bit_width bit
87
192
        s |= static_cast<int64_t>(input[i + 7]);
88
192
        s |= (static_cast<int64_t>(input[i + 6])) << bit_width;
89
192
        s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width);
90
192
        s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width);
91
192
        s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width);
92
192
        s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width);
93
192
        s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width);
94
192
        s |= (static_cast<int64_t>(input[i])) << (7 * bit_width);
95
96
        // Starting with the highest valid bit, take out 8 bits in sequence
97
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
98
        // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end
99
384
        for (int j = 0; j < bit_width; j++) {
100
192
            output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask;
101
192
        }
102
192
        output += bit_width;
103
192
        s = 0;
104
192
    }
105
106
    // remainder
107
16
    int byte = tail_count * bit_width; // How many bits are left to store
108
16
    int bytes = (byte + 7) >> 3;       // How many more bytes are needed to store the rest of input
109
110
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
111
20
    for (int i = 0; i < tail_count; i++) {
112
4
        s |= (static_cast<int64_t>(input[i + full_batch_size]))
113
4
             << ((tail_count - i - 1) * bit_width);
114
4
    }
115
116
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
117
16
    s <<= (bytes << 3) - byte;
118
119
    // Starting with the highest valid bit, take out 8 bits in sequence
120
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
121
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
122
18
    for (int i = 0; i < bytes; i++) {
123
2
        output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask;
124
2
    }
125
16
}
_ZN5doris10ForEncoderIlE10bit_pack_8EPKlhiPh
Line
Count
Source
79
6.10k
void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
80
6.10k
    int64_t s = 0;
81
6.10k
    uint8_t output_mask = 255;
82
6.10k
    int tail_count = in_num & 7;              // the remainder of in_num modulo 8
83
6.10k
    int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8
84
85
69.8k
    for (int i = 0; i < full_batch_size; i += 8) {
86
        // Put the 8 numbers in the input into s in order, each number occupies bit_width bit
87
63.7k
        s |= static_cast<int64_t>(input[i + 7]);
88
63.7k
        s |= (static_cast<int64_t>(input[i + 6])) << bit_width;
89
63.7k
        s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width);
90
63.7k
        s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width);
91
63.7k
        s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width);
92
63.7k
        s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width);
93
63.7k
        s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width);
94
63.7k
        s |= (static_cast<int64_t>(input[i])) << (7 * bit_width);
95
96
        // Starting with the highest valid bit, take out 8 bits in sequence
97
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
98
        // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end
99
349k
        for (int j = 0; j < bit_width; j++) {
100
285k
            output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask;
101
285k
        }
102
63.7k
        output += bit_width;
103
63.7k
        s = 0;
104
63.7k
    }
105
106
    // remainder
107
6.10k
    int byte = tail_count * bit_width; // How many bits are left to store
108
6.10k
    int bytes = (byte + 7) >> 3;       // How many more bytes are needed to store the rest of input
109
110
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
111
20.4k
    for (int i = 0; i < tail_count; i++) {
112
14.3k
        s |= (static_cast<int64_t>(input[i + full_batch_size]))
113
14.3k
             << ((tail_count - i - 1) * bit_width);
114
14.3k
    }
115
116
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
117
6.10k
    s <<= (bytes << 3) - byte;
118
119
    // Starting with the highest valid bit, take out 8 bits in sequence
120
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
121
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
122
15.5k
    for (int i = 0; i < bytes; i++) {
123
9.44k
        output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask;
124
9.44k
    }
125
6.10k
}
_ZN5doris10ForEncoderInE10bit_pack_8EPKnhiPh
Line
Count
Source
79
24.4k
void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
80
24.4k
    int64_t s = 0;
81
24.4k
    uint8_t output_mask = 255;
82
24.4k
    int tail_count = in_num & 7;              // the remainder of in_num modulo 8
83
24.4k
    int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8
84
85
405k
    for (int i = 0; i < full_batch_size; i += 8) {
86
        // Put the 8 numbers in the input into s in order, each number occupies bit_width bit
87
380k
        s |= static_cast<int64_t>(input[i + 7]);
88
380k
        s |= (static_cast<int64_t>(input[i + 6])) << bit_width;
89
380k
        s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width);
90
380k
        s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width);
91
380k
        s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width);
92
380k
        s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width);
93
380k
        s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width);
94
380k
        s |= (static_cast<int64_t>(input[i])) << (7 * bit_width);
95
96
        // Starting with the highest valid bit, take out 8 bits in sequence
97
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
98
        // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end
99
2.09M
        for (int j = 0; j < bit_width; j++) {
100
1.71M
            output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask;
101
1.71M
        }
102
380k
        output += bit_width;
103
380k
        s = 0;
104
380k
    }
105
106
    // remainder
107
24.4k
    int byte = tail_count * bit_width; // How many bits are left to store
108
24.4k
    int bytes = (byte + 7) >> 3;       // How many more bytes are needed to store the rest of input
109
110
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
111
110k
    for (int i = 0; i < tail_count; i++) {
112
86.0k
        s |= (static_cast<int64_t>(input[i + full_batch_size]))
113
86.0k
             << ((tail_count - i - 1) * bit_width);
114
86.0k
    }
115
116
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
117
24.4k
    s <<= (bytes << 3) - byte;
118
119
    // Starting with the highest valid bit, take out 8 bits in sequence
120
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
121
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
122
81.3k
    for (int i = 0; i < bytes; i++) {
123
56.8k
        output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask;
124
56.8k
    }
125
24.4k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIhE10bit_pack_8EPKhhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderItE10bit_pack_8EPKthiPh
_ZN5doris10ForEncoderIjE10bit_pack_8EPKjhiPh
Line
Count
Source
79
12
void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
80
12
    int64_t s = 0;
81
12
    uint8_t output_mask = 255;
82
12
    int tail_count = in_num & 7;              // the remainder of in_num modulo 8
83
12
    int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8
84
85
204
    for (int i = 0; i < full_batch_size; i += 8) {
86
        // Put the 8 numbers in the input into s in order, each number occupies bit_width bit
87
192
        s |= static_cast<int64_t>(input[i + 7]);
88
192
        s |= (static_cast<int64_t>(input[i + 6])) << bit_width;
89
192
        s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width);
90
192
        s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width);
91
192
        s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width);
92
192
        s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width);
93
192
        s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width);
94
192
        s |= (static_cast<int64_t>(input[i])) << (7 * bit_width);
95
96
        // Starting with the highest valid bit, take out 8 bits in sequence
97
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
98
        // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end
99
384
        for (int j = 0; j < bit_width; j++) {
100
192
            output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask;
101
192
        }
102
192
        output += bit_width;
103
192
        s = 0;
104
192
    }
105
106
    // remainder
107
12
    int byte = tail_count * bit_width; // How many bits are left to store
108
12
    int bytes = (byte + 7) >> 3;       // How many more bytes are needed to store the rest of input
109
110
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
111
12
    for (int i = 0; i < tail_count; i++) {
112
0
        s |= (static_cast<int64_t>(input[i + full_batch_size]))
113
0
             << ((tail_count - i - 1) * bit_width);
114
0
    }
115
116
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
117
12
    s <<= (bytes << 3) - byte;
118
119
    // Starting with the highest valid bit, take out 8 bits in sequence
120
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
121
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
122
12
    for (int i = 0; i < bytes; i++) {
123
0
        output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask;
124
0
    }
125
12
}
Unexecuted instantiation: _ZN5doris10ForEncoderImE10bit_pack_8EPKmhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10bit_pack_8EPKS1_hiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIoE10bit_pack_8EPKohiPh
126
127
template <typename T>
128
template <typename U>
129
91.6k
void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
130
91.6k
    U s = 0;
131
91.6k
    uint8_t output_mask = 255;
132
91.6k
    int tail_count = in_num & 3;              // the remainder of in_num modulo 4
133
91.6k
    int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4
134
91.6k
    int output_size = 0;                      // How many outputs can be processed at a time
135
91.6k
    int bit_width_remainder =
136
91.6k
            (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time
137
91.6k
    int extra_bit = 0;            // Extra bits after each process
138
139
2.80M
    for (int i = 0; i < full_batch_size; i += 4) {
140
        // Put the 4 numbers in the input into s in order, each number occupies bit_width bit
141
        // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop
142
2.70M
        s <<= bit_width;
143
2.70M
        s |= (static_cast<U>(input[i]));
144
2.70M
        s <<= bit_width;
145
2.70M
        s |= (static_cast<U>(input[i + 1]));
146
2.70M
        s <<= bit_width;
147
2.70M
        s |= (static_cast<U>(input[i + 2]));
148
2.70M
        s <<= bit_width;
149
2.70M
        s |= (static_cast<U>(input[i + 3]));
150
151
        // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s,
152
        // and there are extra_bit bits left over from the last loop,
153
        // divide by 8 to calculate how much output can be processed in this loop.
154
2.70M
        output_size = ((bit_width << 2) + extra_bit) >> 3;
155
156
        // Each loop will leave bit_width_remainder bit unprocessed,
157
        // last loop will leave extra_bit bit, eventually will leave
158
        // (extra_bit + bit_width_remainder) & 7 bit unprocessed
159
2.70M
        extra_bit = (extra_bit + bit_width_remainder) & 7;
160
161
        // Starting with the highest valid bit, take out 8 bits in sequence
162
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
163
        // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end
164
        // But since there are still extra_bit bits that can't be processed, need to add the extra_bit
165
30.4M
        for (int j = 0; j < output_size; j++) {
166
27.7M
            output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask;
167
27.7M
        }
168
2.70M
        output += output_size;
169
170
        // s retains the post extra_bit bit as it is not processed
171
2.70M
        s &= (1 << extra_bit) - 1;
172
2.70M
    }
173
174
    // remainder
175
91.6k
    int byte = tail_count * bit_width;     // How many bits are left to store
176
91.6k
    if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed
177
91.6k
    int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input
178
179
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
180
220k
    for (int i = 0; i < tail_count; i++) {
181
128k
        s <<= bit_width;
182
128k
        s |= (input[i + full_batch_size]);
183
128k
    }
184
185
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
186
91.6k
    s <<= (bytes << 3) - byte;
187
188
    // Starting with the highest valid bit, take out 8 bits in sequence
189
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
190
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
191
462k
    for (int i = 0; i < bytes; i++) {
192
370k
        output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask;
193
370k
    }
194
91.6k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIaE10bit_pack_4IlEEvPKahiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIaE10bit_pack_4InEEvPKahiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIsE10bit_pack_4IlEEvPKshiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIsE10bit_pack_4InEEvPKshiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIiE10bit_pack_4IlEEvPKihiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIiE10bit_pack_4InEEvPKihiPh
_ZN5doris10ForEncoderIlE10bit_pack_4IlEEvPKlhiPh
Line
Count
Source
129
6.07k
void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
130
6.07k
    U s = 0;
131
6.07k
    uint8_t output_mask = 255;
132
6.07k
    int tail_count = in_num & 3;              // the remainder of in_num modulo 4
133
6.07k
    int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4
134
6.07k
    int output_size = 0;                      // How many outputs can be processed at a time
135
6.07k
    int bit_width_remainder =
136
6.07k
            (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time
137
6.07k
    int extra_bit = 0;            // Extra bits after each process
138
139
135k
    for (int i = 0; i < full_batch_size; i += 4) {
140
        // Put the 4 numbers in the input into s in order, each number occupies bit_width bit
141
        // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop
142
129k
        s <<= bit_width;
143
129k
        s |= (static_cast<U>(input[i]));
144
129k
        s <<= bit_width;
145
129k
        s |= (static_cast<U>(input[i + 1]));
146
129k
        s <<= bit_width;
147
129k
        s |= (static_cast<U>(input[i + 2]));
148
129k
        s <<= bit_width;
149
129k
        s |= (static_cast<U>(input[i + 3]));
150
151
        // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s,
152
        // and there are extra_bit bits left over from the last loop,
153
        // divide by 8 to calculate how much output can be processed in this loop.
154
129k
        output_size = ((bit_width << 2) + extra_bit) >> 3;
155
156
        // Each loop will leave bit_width_remainder bit unprocessed,
157
        // last loop will leave extra_bit bit, eventually will leave
158
        // (extra_bit + bit_width_remainder) & 7 bit unprocessed
159
129k
        extra_bit = (extra_bit + bit_width_remainder) & 7;
160
161
        // Starting with the highest valid bit, take out 8 bits in sequence
162
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
163
        // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end
164
        // But since there are still extra_bit bits that can't be processed, need to add the extra_bit
165
934k
        for (int j = 0; j < output_size; j++) {
166
805k
            output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask;
167
805k
        }
168
129k
        output += output_size;
169
170
        // s retains the post extra_bit bit as it is not processed
171
129k
        s &= (1 << extra_bit) - 1;
172
129k
    }
173
174
    // remainder
175
6.07k
    int byte = tail_count * bit_width;     // How many bits are left to store
176
6.07k
    if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed
177
6.07k
    int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input
178
179
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
180
12.1k
    for (int i = 0; i < tail_count; i++) {
181
6.09k
        s <<= bit_width;
182
6.09k
        s |= (input[i + full_batch_size]);
183
6.09k
    }
184
185
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
186
6.07k
    s <<= (bytes << 3) - byte;
187
188
    // Starting with the highest valid bit, take out 8 bits in sequence
189
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
190
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
191
17.5k
    for (int i = 0; i < bytes; i++) {
192
11.4k
        output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask;
193
11.4k
    }
194
6.07k
}
_ZN5doris10ForEncoderIlE10bit_pack_4InEEvPKlhiPh
Line
Count
Source
129
12.1k
void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
130
12.1k
    U s = 0;
131
12.1k
    uint8_t output_mask = 255;
132
12.1k
    int tail_count = in_num & 3;              // the remainder of in_num modulo 4
133
12.1k
    int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4
134
12.1k
    int output_size = 0;                      // How many outputs can be processed at a time
135
12.1k
    int bit_width_remainder =
136
12.1k
            (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time
137
12.1k
    int extra_bit = 0;            // Extra bits after each process
138
139
270k
    for (int i = 0; i < full_batch_size; i += 4) {
140
        // Put the 4 numbers in the input into s in order, each number occupies bit_width bit
141
        // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop
142
258k
        s <<= bit_width;
143
258k
        s |= (static_cast<U>(input[i]));
144
258k
        s <<= bit_width;
145
258k
        s |= (static_cast<U>(input[i + 1]));
146
258k
        s <<= bit_width;
147
258k
        s |= (static_cast<U>(input[i + 2]));
148
258k
        s <<= bit_width;
149
258k
        s |= (static_cast<U>(input[i + 3]));
150
151
        // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s,
152
        // and there are extra_bit bits left over from the last loop,
153
        // divide by 8 to calculate how much output can be processed in this loop.
154
258k
        output_size = ((bit_width << 2) + extra_bit) >> 3;
155
156
        // Each loop will leave bit_width_remainder bit unprocessed,
157
        // last loop will leave extra_bit bit, eventually will leave
158
        // (extra_bit + bit_width_remainder) & 7 bit unprocessed
159
258k
        extra_bit = (extra_bit + bit_width_remainder) & 7;
160
161
        // Starting with the highest valid bit, take out 8 bits in sequence
162
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
163
        // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end
164
        // But since there are still extra_bit bits that can't be processed, need to add the extra_bit
165
3.41M
        for (int j = 0; j < output_size; j++) {
166
3.16M
            output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask;
167
3.16M
        }
168
258k
        output += output_size;
169
170
        // s retains the post extra_bit bit as it is not processed
171
258k
        s &= (1 << extra_bit) - 1;
172
258k
    }
173
174
    // remainder
175
12.1k
    int byte = tail_count * bit_width;     // How many bits are left to store
176
12.1k
    if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed
177
12.1k
    int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input
178
179
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
180
24.4k
    for (int i = 0; i < tail_count; i++) {
181
12.2k
        s <<= bit_width;
182
12.2k
        s |= (input[i + full_batch_size]);
183
12.2k
    }
184
185
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
186
12.1k
    s <<= (bytes << 3) - byte;
187
188
    // Starting with the highest valid bit, take out 8 bits in sequence
189
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
190
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
191
53.4k
    for (int i = 0; i < bytes; i++) {
192
41.3k
        output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask;
193
41.3k
    }
194
12.1k
}
_ZN5doris10ForEncoderInE10bit_pack_4IlEEvPKnhiPh
Line
Count
Source
129
24.4k
void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
130
24.4k
    U s = 0;
131
24.4k
    uint8_t output_mask = 255;
132
24.4k
    int tail_count = in_num & 3;              // the remainder of in_num modulo 4
133
24.4k
    int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4
134
24.4k
    int output_size = 0;                      // How many outputs can be processed at a time
135
24.4k
    int bit_width_remainder =
136
24.4k
            (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time
137
24.4k
    int extra_bit = 0;            // Extra bits after each process
138
139
798k
    for (int i = 0; i < full_batch_size; i += 4) {
140
        // Put the 4 numbers in the input into s in order, each number occupies bit_width bit
141
        // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop
142
774k
        s <<= bit_width;
143
774k
        s |= (static_cast<U>(input[i]));
144
774k
        s <<= bit_width;
145
774k
        s |= (static_cast<U>(input[i + 1]));
146
774k
        s <<= bit_width;
147
774k
        s |= (static_cast<U>(input[i + 2]));
148
774k
        s <<= bit_width;
149
774k
        s |= (static_cast<U>(input[i + 3]));
150
151
        // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s,
152
        // and there are extra_bit bits left over from the last loop,
153
        // divide by 8 to calculate how much output can be processed in this loop.
154
774k
        output_size = ((bit_width << 2) + extra_bit) >> 3;
155
156
        // Each loop will leave bit_width_remainder bit unprocessed,
157
        // last loop will leave extra_bit bit, eventually will leave
158
        // (extra_bit + bit_width_remainder) & 7 bit unprocessed
159
774k
        extra_bit = (extra_bit + bit_width_remainder) & 7;
160
161
        // Starting with the highest valid bit, take out 8 bits in sequence
162
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
163
        // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end
164
        // But since there are still extra_bit bits that can't be processed, need to add the extra_bit
165
5.60M
        for (int j = 0; j < output_size; j++) {
166
4.83M
            output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask;
167
4.83M
        }
168
774k
        output += output_size;
169
170
        // s retains the post extra_bit bit as it is not processed
171
774k
        s &= (1 << extra_bit) - 1;
172
774k
    }
173
174
    // remainder
175
24.4k
    int byte = tail_count * bit_width;     // How many bits are left to store
176
24.4k
    if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed
177
24.4k
    int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input
178
179
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
180
61.3k
    for (int i = 0; i < tail_count; i++) {
181
36.8k
        s <<= bit_width;
182
36.8k
        s |= (input[i + full_batch_size]);
183
36.8k
    }
184
185
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
186
24.4k
    s <<= (bytes << 3) - byte;
187
188
    // Starting with the highest valid bit, take out 8 bits in sequence
189
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
190
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
191
93.6k
    for (int i = 0; i < bytes; i++) {
192
69.1k
        output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask;
193
69.1k
    }
194
24.4k
}
_ZN5doris10ForEncoderInE10bit_pack_4InEEvPKnhiPh
Line
Count
Source
129
48.9k
void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
130
48.9k
    U s = 0;
131
48.9k
    uint8_t output_mask = 255;
132
48.9k
    int tail_count = in_num & 3;              // the remainder of in_num modulo 4
133
48.9k
    int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4
134
48.9k
    int output_size = 0;                      // How many outputs can be processed at a time
135
48.9k
    int bit_width_remainder =
136
48.9k
            (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time
137
48.9k
    int extra_bit = 0;            // Extra bits after each process
138
139
1.59M
    for (int i = 0; i < full_batch_size; i += 4) {
140
        // Put the 4 numbers in the input into s in order, each number occupies bit_width bit
141
        // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop
142
1.54M
        s <<= bit_width;
143
1.54M
        s |= (static_cast<U>(input[i]));
144
1.54M
        s <<= bit_width;
145
1.54M
        s |= (static_cast<U>(input[i + 1]));
146
1.54M
        s <<= bit_width;
147
1.54M
        s |= (static_cast<U>(input[i + 2]));
148
1.54M
        s <<= bit_width;
149
1.54M
        s |= (static_cast<U>(input[i + 3]));
150
151
        // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s,
152
        // and there are extra_bit bits left over from the last loop,
153
        // divide by 8 to calculate how much output can be processed in this loop.
154
1.54M
        output_size = ((bit_width << 2) + extra_bit) >> 3;
155
156
        // Each loop will leave bit_width_remainder bit unprocessed,
157
        // last loop will leave extra_bit bit, eventually will leave
158
        // (extra_bit + bit_width_remainder) & 7 bit unprocessed
159
1.54M
        extra_bit = (extra_bit + bit_width_remainder) & 7;
160
161
        // Starting with the highest valid bit, take out 8 bits in sequence
162
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
163
        // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end
164
        // But since there are still extra_bit bits that can't be processed, need to add the extra_bit
165
20.5M
        for (int j = 0; j < output_size; j++) {
166
18.9M
            output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask;
167
18.9M
        }
168
1.54M
        output += output_size;
169
170
        // s retains the post extra_bit bit as it is not processed
171
1.54M
        s &= (1 << extra_bit) - 1;
172
1.54M
    }
173
174
    // remainder
175
48.9k
    int byte = tail_count * bit_width;     // How many bits are left to store
176
48.9k
    if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed
177
48.9k
    int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input
178
179
    // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit
180
122k
    for (int i = 0; i < tail_count; i++) {
181
73.7k
        s <<= bit_width;
182
73.7k
        s |= (input[i + full_batch_size]);
183
73.7k
    }
184
185
    // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end
186
48.9k
    s <<= (bytes << 3) - byte;
187
188
    // Starting with the highest valid bit, take out 8 bits in sequence
189
    // perform an AND operation with output_mask to ensure that only 8 bits are valid.
190
    // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end
191
297k
    for (int i = 0; i < bytes; i++) {
192
248k
        output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask;
193
248k
    }
194
48.9k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIhE10bit_pack_4IlEEvPKhhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIhE10bit_pack_4InEEvPKhhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderItE10bit_pack_4IlEEvPKthiPh
Unexecuted instantiation: _ZN5doris10ForEncoderItE10bit_pack_4InEEvPKthiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIjE10bit_pack_4IlEEvPKjhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIjE10bit_pack_4InEEvPKjhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderImE10bit_pack_4IlEEvPKmhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderImE10bit_pack_4InEEvPKmhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10bit_pack_4IlEEvPKS1_hiPh
Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10bit_pack_4InEEvPKS1_hiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIoE10bit_pack_4IlEEvPKohiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIoE10bit_pack_4InEEvPKohiPh
195
196
template <typename T>
197
363k
void ForEncoder<T>::bit_pack_1(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
198
363k
    int output_mask = 255;
199
363k
    int need_bit = 0; // still need
200
201
43.8M
    for (int i = 0; i < in_num; i++) {
202
43.4M
        T x = input[i];
203
43.4M
        int width = bit_width;
204
43.4M
        if (need_bit) {
205
            // The last time we take away the high 8 - need_bit,
206
            // we need to make up the rest of the need_bit from the width.
207
            // Use width - need_bit to compute high need_bit bits
208
30.0M
            *output |= x >> (width - need_bit);
209
30.0M
            output++;
210
            // There are need_bit bits being used, so subtract
211
30.0M
            width -= need_bit;
212
30.0M
        }
213
43.4M
        int num = width >> 3;      // How many outputs can be processed at a time
214
43.4M
        int remainder = width & 7; // How many bits are left to store
215
216
        // Starting with the highest valid bit, take out 8 bits in sequence
217
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
218
        // (num-j-1)<<3 used to calculate how many bits need to be removed at the end
219
        // But since there are still remainder bits that can't be processed, need to add the remainder
220
447M
        for (int j = 0; j < num; j++) {
221
404M
            *output = cast_set<uint8_t>((x >> (((num - j - 1) << 3) + remainder)) & output_mask);
222
404M
            output++;
223
404M
        }
224
43.4M
        if (remainder) {
225
            // Process the last remaining remainder bit.
226
            // y = (x & ((1 << remainder) - 1)) extract the last remainder bits.
227
            // ouput = y << (8 - reaminder)  Use the high 8 - remainder bit
228
30.3M
            *output = cast_set<uint8_t>((x & ((1 << remainder) - 1)) << (8 - remainder));
229
            // Already have remainder bits, next time need 8-remainder bits
230
30.3M
            need_bit = 8 - remainder;
231
30.3M
        } else {
232
13.1M
            need_bit = 0;
233
13.1M
        }
234
43.4M
    }
235
363k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIaE10bit_pack_1EPKahiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIsE10bit_pack_1EPKshiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIiE10bit_pack_1EPKihiPh
_ZN5doris10ForEncoderIlE10bit_pack_1EPKlhiPh
Line
Count
Source
197
24.3k
void ForEncoder<T>::bit_pack_1(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
198
24.3k
    int output_mask = 255;
199
24.3k
    int need_bit = 0; // still need
200
201
2.11M
    for (int i = 0; i < in_num; i++) {
202
2.08M
        T x = input[i];
203
2.08M
        int width = bit_width;
204
2.08M
        if (need_bit) {
205
            // The last time we take away the high 8 - need_bit,
206
            // we need to make up the rest of the need_bit from the width.
207
            // Use width - need_bit to compute high need_bit bits
208
1.48M
            *output |= x >> (width - need_bit);
209
1.48M
            output++;
210
            // There are need_bit bits being used, so subtract
211
1.48M
            width -= need_bit;
212
1.48M
        }
213
2.08M
        int num = width >> 3;      // How many outputs can be processed at a time
214
2.08M
        int remainder = width & 7; // How many bits are left to store
215
216
        // Starting with the highest valid bit, take out 8 bits in sequence
217
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
218
        // (num-j-1)<<3 used to calculate how many bits need to be removed at the end
219
        // But since there are still remainder bits that can't be processed, need to add the remainder
220
13.2M
        for (int j = 0; j < num; j++) {
221
11.1M
            *output = cast_set<uint8_t>((x >> (((num - j - 1) << 3) + remainder)) & output_mask);
222
11.1M
            output++;
223
11.1M
        }
224
2.08M
        if (remainder) {
225
            // Process the last remaining remainder bit.
226
            // y = (x & ((1 << remainder) - 1)) extract the last remainder bits.
227
            // ouput = y << (8 - reaminder)  Use the high 8 - remainder bit
228
1.49M
            *output = cast_set<uint8_t>((x & ((1 << remainder) - 1)) << (8 - remainder));
229
            // Already have remainder bits, next time need 8-remainder bits
230
1.49M
            need_bit = 8 - remainder;
231
1.49M
        } else {
232
589k
            need_bit = 0;
233
589k
        }
234
2.08M
    }
235
24.3k
}
_ZN5doris10ForEncoderInE10bit_pack_1EPKnhiPh
Line
Count
Source
197
339k
void ForEncoder<T>::bit_pack_1(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
198
339k
    int output_mask = 255;
199
339k
    int need_bit = 0; // still need
200
201
41.7M
    for (int i = 0; i < in_num; i++) {
202
41.3M
        T x = input[i];
203
41.3M
        int width = bit_width;
204
41.3M
        if (need_bit) {
205
            // The last time we take away the high 8 - need_bit,
206
            // we need to make up the rest of the need_bit from the width.
207
            // Use width - need_bit to compute high need_bit bits
208
28.6M
            *output |= x >> (width - need_bit);
209
28.6M
            output++;
210
            // There are need_bit bits being used, so subtract
211
28.6M
            width -= need_bit;
212
28.6M
        }
213
41.3M
        int num = width >> 3;      // How many outputs can be processed at a time
214
41.3M
        int remainder = width & 7; // How many bits are left to store
215
216
        // Starting with the highest valid bit, take out 8 bits in sequence
217
        // perform an AND operation with output_mask to ensure that only 8 bits are valid
218
        // (num-j-1)<<3 used to calculate how many bits need to be removed at the end
219
        // But since there are still remainder bits that can't be processed, need to add the remainder
220
434M
        for (int j = 0; j < num; j++) {
221
393M
            *output = cast_set<uint8_t>((x >> (((num - j - 1) << 3) + remainder)) & output_mask);
222
393M
            output++;
223
393M
        }
224
41.3M
        if (remainder) {
225
            // Process the last remaining remainder bit.
226
            // y = (x & ((1 << remainder) - 1)) extract the last remainder bits.
227
            // ouput = y << (8 - reaminder)  Use the high 8 - remainder bit
228
28.8M
            *output = cast_set<uint8_t>((x & ((1 << remainder) - 1)) << (8 - remainder));
229
            // Already have remainder bits, next time need 8-remainder bits
230
28.8M
            need_bit = 8 - remainder;
231
28.8M
        } else {
232
12.5M
            need_bit = 0;
233
12.5M
        }
234
41.3M
    }
235
339k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIhE10bit_pack_1EPKhhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderItE10bit_pack_1EPKthiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIjE10bit_pack_1EPKjhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderImE10bit_pack_1EPKmhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10bit_pack_1EPKS1_hiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIoE10bit_pack_1EPKohiPh
236
237
// Use as few bit as possible to store a piece of integer data.
238
// param[in] input: the integer list need to pack
239
// param[in] in_num: the number integer need to pack
240
// param[in] bit_width: how many bit we use to store each integer data
241
// param[out] out: the packed result
242
243
// For example:
244
// The input is int32 list: 1, 2, 4, 8 and bit_width is 4
245
// The output will be: 0001 0010 0100 1000
246
template <typename T>
247
486k
void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
248
486k
    if (in_num == 0 || bit_width == 0) {
249
520
        return;
250
520
    }
251
    /*
252
        bit_width <= 8 : pack_8 > pack_16 > pack_32
253
        bit_width <= 16 : pack_4 > pack_8 > pack_16
254
        bit_width <= 32 : pack_4 >= pack_2 > pack_8 
255
        (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently)
256
        bit_width <= 64 : pack_1 > pack_4
257
    */
258
485k
    if (bit_width <= 8) {
259
30.6k
        bit_pack_8(input, in_num, bit_width, output);
260
455k
    } else if (bit_width <= 16) {
261
30.5k
        bit_pack_4<int64_t>(input, in_num, bit_width, output);
262
424k
    } else if (bit_width <= 32) {
263
61.1k
        bit_pack_4<__int128_t>(input, in_num, bit_width, output);
264
363k
    } else {
265
363k
        bit_pack_1(input, in_num, bit_width, output);
266
363k
    }
267
485k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIaE8bit_packEPKahiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIsE8bit_packEPKshiPh
_ZN5doris10ForEncoderIiE8bit_packEPKihiPh
Line
Count
Source
247
18
void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
248
18
    if (in_num == 0 || bit_width == 0) {
249
2
        return;
250
2
    }
251
    /*
252
        bit_width <= 8 : pack_8 > pack_16 > pack_32
253
        bit_width <= 16 : pack_4 > pack_8 > pack_16
254
        bit_width <= 32 : pack_4 >= pack_2 > pack_8 
255
        (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently)
256
        bit_width <= 64 : pack_1 > pack_4
257
    */
258
16
    if (bit_width <= 8) {
259
16
        bit_pack_8(input, in_num, bit_width, output);
260
16
    } else if (bit_width <= 16) {
261
0
        bit_pack_4<int64_t>(input, in_num, bit_width, output);
262
0
    } else if (bit_width <= 32) {
263
0
        bit_pack_4<__int128_t>(input, in_num, bit_width, output);
264
0
    } else {
265
0
        bit_pack_1(input, in_num, bit_width, output);
266
0
    }
267
16
}
_ZN5doris10ForEncoderIlE8bit_packEPKlhiPh
Line
Count
Source
247
48.9k
void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
248
48.9k
    if (in_num == 0 || bit_width == 0) {
249
262
        return;
250
262
    }
251
    /*
252
        bit_width <= 8 : pack_8 > pack_16 > pack_32
253
        bit_width <= 16 : pack_4 > pack_8 > pack_16
254
        bit_width <= 32 : pack_4 >= pack_2 > pack_8 
255
        (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently)
256
        bit_width <= 64 : pack_1 > pack_4
257
    */
258
48.6k
    if (bit_width <= 8) {
259
6.10k
        bit_pack_8(input, in_num, bit_width, output);
260
42.5k
    } else if (bit_width <= 16) {
261
6.07k
        bit_pack_4<int64_t>(input, in_num, bit_width, output);
262
36.4k
    } else if (bit_width <= 32) {
263
12.1k
        bit_pack_4<__int128_t>(input, in_num, bit_width, output);
264
24.3k
    } else {
265
24.3k
        bit_pack_1(input, in_num, bit_width, output);
266
24.3k
    }
267
48.6k
}
_ZN5doris10ForEncoderInE8bit_packEPKnhiPh
Line
Count
Source
247
437k
void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
248
437k
    if (in_num == 0 || bit_width == 0) {
249
256
        return;
250
256
    }
251
    /*
252
        bit_width <= 8 : pack_8 > pack_16 > pack_32
253
        bit_width <= 16 : pack_4 > pack_8 > pack_16
254
        bit_width <= 32 : pack_4 >= pack_2 > pack_8 
255
        (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently)
256
        bit_width <= 64 : pack_1 > pack_4
257
    */
258
437k
    if (bit_width <= 8) {
259
24.4k
        bit_pack_8(input, in_num, bit_width, output);
260
412k
    } else if (bit_width <= 16) {
261
24.4k
        bit_pack_4<int64_t>(input, in_num, bit_width, output);
262
388k
    } else if (bit_width <= 32) {
263
48.9k
        bit_pack_4<__int128_t>(input, in_num, bit_width, output);
264
339k
    } else {
265
339k
        bit_pack_1(input, in_num, bit_width, output);
266
339k
    }
267
437k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIhE8bit_packEPKhhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderItE8bit_packEPKthiPh
_ZN5doris10ForEncoderIjE8bit_packEPKjhiPh
Line
Count
Source
247
12
void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
248
12
    if (in_num == 0 || bit_width == 0) {
249
0
        return;
250
0
    }
251
    /*
252
        bit_width <= 8 : pack_8 > pack_16 > pack_32
253
        bit_width <= 16 : pack_4 > pack_8 > pack_16
254
        bit_width <= 32 : pack_4 >= pack_2 > pack_8 
255
        (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently)
256
        bit_width <= 64 : pack_1 > pack_4
257
    */
258
12
    if (bit_width <= 8) {
259
12
        bit_pack_8(input, in_num, bit_width, output);
260
12
    } else if (bit_width <= 16) {
261
0
        bit_pack_4<int64_t>(input, in_num, bit_width, output);
262
0
    } else if (bit_width <= 32) {
263
0
        bit_pack_4<__int128_t>(input, in_num, bit_width, output);
264
0
    } else {
265
0
        bit_pack_1(input, in_num, bit_width, output);
266
0
    }
267
12
}
Unexecuted instantiation: _ZN5doris10ForEncoderImE8bit_packEPKmhiPh
Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE8bit_packEPKS1_hiPh
Unexecuted instantiation: _ZN5doris10ForEncoderIoE8bit_packEPKohiPh
268
269
template <typename T>
270
97.8k
void ForEncoder<T>::bit_packing_one_frame_value(const T* input) {
271
97.8k
    T min = input[0];
272
97.8k
    T max = input[0];
273
97.8k
    bool is_ascending = true;
274
97.8k
    uint8_t bit_width = 0;
275
97.8k
    T half_max_delta = numeric_limits_max() >> 1;
276
97.8k
    bool is_keep_original_value = false;
277
278
    // 1. make sure order_flag, save_original_value, and find max&min.
279
8.36M
    for (uint8_t i = 1; i < _buffered_values_num; ++i) {
280
8.26M
        if (is_ascending) {
281
172k
            if (input[i] < input[i - 1]) {
282
96.9k
                is_ascending = false;
283
96.9k
            } else {
284
76.0k
                if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow
285
0
                    is_keep_original_value = true;
286
76.0k
                } else {
287
76.0k
                    bit_width = std::max(bit_width, bits(input[i] - input[i - 1]));
288
76.0k
                }
289
76.0k
            }
290
172k
        }
291
292
8.26M
        if (input[i] < min) {
293
361k
            min = input[i];
294
361k
            continue;
295
361k
        }
296
297
7.90M
        if (input[i] > max) {
298
367k
            max = input[i];
299
367k
        }
300
7.90M
    }
301
97.8k
    if (!is_ascending) {
302
96.9k
        if ((max >> 1) - (min >> 1) > half_max_delta) {
303
0
            is_keep_original_value = true;
304
0
        }
305
96.9k
    }
306
307
    // 2. save min value.
308
97.8k
    if (sizeof(T) == 16) {
309
48.8k
        put_fixed128_le(_buffer, static_cast<uint128_t>(min));
310
48.9k
    } else if (sizeof(T) == 8) {
311
48.9k
        put_fixed64_le(_buffer, static_cast<uint64_t>(min));
312
48.9k
    } else {
313
30
        put_fixed32_le(_buffer, static_cast<uint32_t>(min));
314
30
    }
315
316
    // 3.1 save original value.
317
97.8k
    if (is_keep_original_value) {
318
0
        bit_width = sizeof(T) * 8;
319
0
        uint32_t len = _buffered_values_num * bit_width;
320
0
        _buffer->reserve(_buffer->size() + len);
321
0
        size_t origin_size = _buffer->size();
322
0
        _buffer->resize(origin_size + len);
323
0
        bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size);
324
97.8k
    } else {
325
        // 3.2 bit pack.
326
        // improve for ascending order input, we could use fewer bit
327
97.8k
        T delta_values[FRAME_VALUE_NUM];
328
97.8k
        if (is_ascending) {
329
898
            delta_values[0] = 0;
330
6.34k
            for (uint8_t i = 1; i < _buffered_values_num; ++i) {
331
5.45k
                delta_values[i] = input[i] - input[i - 1];
332
5.45k
            }
333
96.9k
        } else {
334
96.9k
            bit_width = bits(static_cast<T>(max - min));
335
8.45M
            for (uint8_t i = 0; i < _buffered_values_num; ++i) {
336
8.35M
                delta_values[i] = input[i] - min;
337
8.35M
            }
338
96.9k
        }
339
340
97.8k
        uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8);
341
342
97.8k
        _buffer->reserve(_buffer->size() + packing_len);
343
97.8k
        size_t origin_size = _buffer->size();
344
97.8k
        _buffer->resize(origin_size + packing_len);
345
97.8k
        bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size);
346
97.8k
    }
347
97.8k
    uint8_t storage_format = 0;
348
97.8k
    if (is_keep_original_value) {
349
0
        storage_format = 2;
350
97.8k
    } else if (is_ascending) {
351
898
        storage_format = 1;
352
898
    }
353
97.8k
    _storage_formats.push_back(storage_format);
354
97.8k
    _bit_widths.push_back(bit_width);
355
356
97.8k
    _buffered_values_num = 0;
357
97.8k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIaE27bit_packing_one_frame_valueEPKa
Unexecuted instantiation: _ZN5doris10ForEncoderIsE27bit_packing_one_frame_valueEPKs
_ZN5doris10ForEncoderIiE27bit_packing_one_frame_valueEPKi
Line
Count
Source
270
18
void ForEncoder<T>::bit_packing_one_frame_value(const T* input) {
271
18
    T min = input[0];
272
18
    T max = input[0];
273
18
    bool is_ascending = true;
274
18
    uint8_t bit_width = 0;
275
18
    T half_max_delta = numeric_limits_max() >> 1;
276
18
    bool is_keep_original_value = false;
277
278
    // 1. make sure order_flag, save_original_value, and find max&min.
279
1.54k
    for (uint8_t i = 1; i < _buffered_values_num; ++i) {
280
1.52k
        if (is_ascending) {
281
1.52k
            if (input[i] < input[i - 1]) {
282
0
                is_ascending = false;
283
1.52k
            } else {
284
1.52k
                if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow
285
0
                    is_keep_original_value = true;
286
1.52k
                } else {
287
1.52k
                    bit_width = std::max(bit_width, bits(input[i] - input[i - 1]));
288
1.52k
                }
289
1.52k
            }
290
1.52k
        }
291
292
1.52k
        if (input[i] < min) {
293
0
            min = input[i];
294
0
            continue;
295
0
        }
296
297
1.52k
        if (input[i] > max) {
298
1.52k
            max = input[i];
299
1.52k
        }
300
1.52k
    }
301
18
    if (!is_ascending) {
302
0
        if ((max >> 1) - (min >> 1) > half_max_delta) {
303
0
            is_keep_original_value = true;
304
0
        }
305
0
    }
306
307
    // 2. save min value.
308
18
    if (sizeof(T) == 16) {
309
0
        put_fixed128_le(_buffer, static_cast<uint128_t>(min));
310
18
    } else if (sizeof(T) == 8) {
311
0
        put_fixed64_le(_buffer, static_cast<uint64_t>(min));
312
18
    } else {
313
18
        put_fixed32_le(_buffer, static_cast<uint32_t>(min));
314
18
    }
315
316
    // 3.1 save original value.
317
18
    if (is_keep_original_value) {
318
0
        bit_width = sizeof(T) * 8;
319
0
        uint32_t len = _buffered_values_num * bit_width;
320
0
        _buffer->reserve(_buffer->size() + len);
321
0
        size_t origin_size = _buffer->size();
322
0
        _buffer->resize(origin_size + len);
323
0
        bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size);
324
18
    } else {
325
        // 3.2 bit pack.
326
        // improve for ascending order input, we could use fewer bit
327
18
        T delta_values[FRAME_VALUE_NUM];
328
18
        if (is_ascending) {
329
18
            delta_values[0] = 0;
330
1.54k
            for (uint8_t i = 1; i < _buffered_values_num; ++i) {
331
1.52k
                delta_values[i] = input[i] - input[i - 1];
332
1.52k
            }
333
18
        } else {
334
0
            bit_width = bits(static_cast<T>(max - min));
335
0
            for (uint8_t i = 0; i < _buffered_values_num; ++i) {
336
0
                delta_values[i] = input[i] - min;
337
0
            }
338
0
        }
339
340
18
        uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8);
341
342
18
        _buffer->reserve(_buffer->size() + packing_len);
343
18
        size_t origin_size = _buffer->size();
344
18
        _buffer->resize(origin_size + packing_len);
345
18
        bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size);
346
18
    }
347
18
    uint8_t storage_format = 0;
348
18
    if (is_keep_original_value) {
349
0
        storage_format = 2;
350
18
    } else if (is_ascending) {
351
18
        storage_format = 1;
352
18
    }
353
18
    _storage_formats.push_back(storage_format);
354
18
    _bit_widths.push_back(bit_width);
355
356
18
    _buffered_values_num = 0;
357
18
}
_ZN5doris10ForEncoderIlE27bit_packing_one_frame_valueEPKl
Line
Count
Source
270
48.9k
void ForEncoder<T>::bit_packing_one_frame_value(const T* input) {
271
48.9k
    T min = input[0];
272
48.9k
    T max = input[0];
273
48.9k
    bool is_ascending = true;
274
48.9k
    uint8_t bit_width = 0;
275
48.9k
    T half_max_delta = numeric_limits_max() >> 1;
276
48.9k
    bool is_keep_original_value = false;
277
278
    // 1. make sure order_flag, save_original_value, and find max&min.
279
4.17M
    for (uint8_t i = 1; i < _buffered_values_num; ++i) {
280
4.13M
        if (is_ascending) {
281
86.7k
            if (input[i] < input[i - 1]) {
282
48.4k
                is_ascending = false;
283
48.4k
            } else {
284
38.2k
                if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow
285
0
                    is_keep_original_value = true;
286
38.2k
                } else {
287
38.2k
                    bit_width = std::max(bit_width, bits(input[i] - input[i - 1]));
288
38.2k
                }
289
38.2k
            }
290
86.7k
        }
291
292
4.13M
        if (input[i] < min) {
293
176k
            min = input[i];
294
176k
            continue;
295
176k
        }
296
297
3.95M
        if (input[i] > max) {
298
179k
            max = input[i];
299
179k
        }
300
3.95M
    }
301
48.9k
    if (!is_ascending) {
302
48.4k
        if ((max >> 1) - (min >> 1) > half_max_delta) {
303
0
            is_keep_original_value = true;
304
0
        }
305
48.4k
    }
306
307
    // 2. save min value.
308
48.9k
    if (sizeof(T) == 16) {
309
0
        put_fixed128_le(_buffer, static_cast<uint128_t>(min));
310
48.9k
    } else if (sizeof(T) == 8) {
311
48.9k
        put_fixed64_le(_buffer, static_cast<uint64_t>(min));
312
48.9k
    } else {
313
0
        put_fixed32_le(_buffer, static_cast<uint32_t>(min));
314
0
    }
315
316
    // 3.1 save original value.
317
48.9k
    if (is_keep_original_value) {
318
0
        bit_width = sizeof(T) * 8;
319
0
        uint32_t len = _buffered_values_num * bit_width;
320
0
        _buffer->reserve(_buffer->size() + len);
321
0
        size_t origin_size = _buffer->size();
322
0
        _buffer->resize(origin_size + len);
323
0
        bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size);
324
48.9k
    } else {
325
        // 3.2 bit pack.
326
        // improve for ascending order input, we could use fewer bit
327
48.9k
        T delta_values[FRAME_VALUE_NUM];
328
48.9k
        if (is_ascending) {
329
440
            delta_values[0] = 0;
330
2.59k
            for (uint8_t i = 1; i < _buffered_values_num; ++i) {
331
2.15k
                delta_values[i] = input[i] - input[i - 1];
332
2.15k
            }
333
48.4k
        } else {
334
48.4k
            bit_width = bits(static_cast<T>(max - min));
335
4.22M
            for (uint8_t i = 0; i < _buffered_values_num; ++i) {
336
4.17M
                delta_values[i] = input[i] - min;
337
4.17M
            }
338
48.4k
        }
339
340
48.9k
        uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8);
341
342
48.9k
        _buffer->reserve(_buffer->size() + packing_len);
343
48.9k
        size_t origin_size = _buffer->size();
344
48.9k
        _buffer->resize(origin_size + packing_len);
345
48.9k
        bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size);
346
48.9k
    }
347
48.9k
    uint8_t storage_format = 0;
348
48.9k
    if (is_keep_original_value) {
349
0
        storage_format = 2;
350
48.9k
    } else if (is_ascending) {
351
440
        storage_format = 1;
352
440
    }
353
48.9k
    _storage_formats.push_back(storage_format);
354
48.9k
    _bit_widths.push_back(bit_width);
355
356
48.9k
    _buffered_values_num = 0;
357
48.9k
}
_ZN5doris10ForEncoderInE27bit_packing_one_frame_valueEPKn
Line
Count
Source
270
48.8k
void ForEncoder<T>::bit_packing_one_frame_value(const T* input) {
271
48.8k
    T min = input[0];
272
48.8k
    T max = input[0];
273
48.8k
    bool is_ascending = true;
274
48.8k
    uint8_t bit_width = 0;
275
48.8k
    T half_max_delta = numeric_limits_max() >> 1;
276
48.8k
    bool is_keep_original_value = false;
277
278
    // 1. make sure order_flag, save_original_value, and find max&min.
279
4.17M
    for (uint8_t i = 1; i < _buffered_values_num; ++i) {
280
4.12M
        if (is_ascending) {
281
83.1k
            if (input[i] < input[i - 1]) {
282
48.4k
                is_ascending = false;
283
48.4k
            } else {
284
34.7k
                if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow
285
0
                    is_keep_original_value = true;
286
34.7k
                } else {
287
34.7k
                    bit_width = std::max(bit_width, bits(input[i] - input[i - 1]));
288
34.7k
                }
289
34.7k
            }
290
83.1k
        }
291
292
4.12M
        if (input[i] < min) {
293
185k
            min = input[i];
294
185k
            continue;
295
185k
        }
296
297
3.94M
        if (input[i] > max) {
298
185k
            max = input[i];
299
185k
        }
300
3.94M
    }
301
48.8k
    if (!is_ascending) {
302
48.4k
        if ((max >> 1) - (min >> 1) > half_max_delta) {
303
0
            is_keep_original_value = true;
304
0
        }
305
48.4k
    }
306
307
    // 2. save min value.
308
48.8k
    if (sizeof(T) == 16) {
309
48.8k
        put_fixed128_le(_buffer, static_cast<uint128_t>(min));
310
48.8k
    } else if (sizeof(T) == 8) {
311
0
        put_fixed64_le(_buffer, static_cast<uint64_t>(min));
312
0
    } else {
313
0
        put_fixed32_le(_buffer, static_cast<uint32_t>(min));
314
0
    }
315
316
    // 3.1 save original value.
317
48.8k
    if (is_keep_original_value) {
318
0
        bit_width = sizeof(T) * 8;
319
0
        uint32_t len = _buffered_values_num * bit_width;
320
0
        _buffer->reserve(_buffer->size() + len);
321
0
        size_t origin_size = _buffer->size();
322
0
        _buffer->resize(origin_size + len);
323
0
        bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size);
324
48.8k
    } else {
325
        // 3.2 bit pack.
326
        // improve for ascending order input, we could use fewer bit
327
48.8k
        T delta_values[FRAME_VALUE_NUM];
328
48.8k
        if (is_ascending) {
329
428
            delta_values[0] = 0;
330
676
            for (uint8_t i = 1; i < _buffered_values_num; ++i) {
331
248
                delta_values[i] = input[i] - input[i - 1];
332
248
            }
333
48.4k
        } else {
334
48.4k
            bit_width = bits(static_cast<T>(max - min));
335
4.22M
            for (uint8_t i = 0; i < _buffered_values_num; ++i) {
336
4.17M
                delta_values[i] = input[i] - min;
337
4.17M
            }
338
48.4k
        }
339
340
48.8k
        uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8);
341
342
48.8k
        _buffer->reserve(_buffer->size() + packing_len);
343
48.8k
        size_t origin_size = _buffer->size();
344
48.8k
        _buffer->resize(origin_size + packing_len);
345
48.8k
        bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size);
346
48.8k
    }
347
48.8k
    uint8_t storage_format = 0;
348
48.8k
    if (is_keep_original_value) {
349
0
        storage_format = 2;
350
48.8k
    } else if (is_ascending) {
351
428
        storage_format = 1;
352
428
    }
353
48.8k
    _storage_formats.push_back(storage_format);
354
48.8k
    _bit_widths.push_back(bit_width);
355
356
48.8k
    _buffered_values_num = 0;
357
48.8k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIhE27bit_packing_one_frame_valueEPKh
Unexecuted instantiation: _ZN5doris10ForEncoderItE27bit_packing_one_frame_valueEPKt
_ZN5doris10ForEncoderIjE27bit_packing_one_frame_valueEPKj
Line
Count
Source
270
12
void ForEncoder<T>::bit_packing_one_frame_value(const T* input) {
271
12
    T min = input[0];
272
12
    T max = input[0];
273
12
    bool is_ascending = true;
274
12
    uint8_t bit_width = 0;
275
12
    T half_max_delta = numeric_limits_max() >> 1;
276
12
    bool is_keep_original_value = false;
277
278
    // 1. make sure order_flag, save_original_value, and find max&min.
279
1.53k
    for (uint8_t i = 1; i < _buffered_values_num; ++i) {
280
1.52k
        if (is_ascending) {
281
1.52k
            if (input[i] < input[i - 1]) {
282
0
                is_ascending = false;
283
1.52k
            } else {
284
1.52k
                if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow
285
0
                    is_keep_original_value = true;
286
1.52k
                } else {
287
1.52k
                    bit_width = std::max(bit_width, bits(input[i] - input[i - 1]));
288
1.52k
                }
289
1.52k
            }
290
1.52k
        }
291
292
1.52k
        if (input[i] < min) {
293
0
            min = input[i];
294
0
            continue;
295
0
        }
296
297
1.52k
        if (input[i] > max) {
298
1.52k
            max = input[i];
299
1.52k
        }
300
1.52k
    }
301
12
    if (!is_ascending) {
302
0
        if ((max >> 1) - (min >> 1) > half_max_delta) {
303
0
            is_keep_original_value = true;
304
0
        }
305
0
    }
306
307
    // 2. save min value.
308
12
    if (sizeof(T) == 16) {
309
0
        put_fixed128_le(_buffer, static_cast<uint128_t>(min));
310
12
    } else if (sizeof(T) == 8) {
311
0
        put_fixed64_le(_buffer, static_cast<uint64_t>(min));
312
12
    } else {
313
12
        put_fixed32_le(_buffer, static_cast<uint32_t>(min));
314
12
    }
315
316
    // 3.1 save original value.
317
12
    if (is_keep_original_value) {
318
0
        bit_width = sizeof(T) * 8;
319
0
        uint32_t len = _buffered_values_num * bit_width;
320
0
        _buffer->reserve(_buffer->size() + len);
321
0
        size_t origin_size = _buffer->size();
322
0
        _buffer->resize(origin_size + len);
323
0
        bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size);
324
12
    } else {
325
        // 3.2 bit pack.
326
        // improve for ascending order input, we could use fewer bit
327
12
        T delta_values[FRAME_VALUE_NUM];
328
12
        if (is_ascending) {
329
12
            delta_values[0] = 0;
330
1.53k
            for (uint8_t i = 1; i < _buffered_values_num; ++i) {
331
1.52k
                delta_values[i] = input[i] - input[i - 1];
332
1.52k
            }
333
12
        } else {
334
0
            bit_width = bits(static_cast<T>(max - min));
335
0
            for (uint8_t i = 0; i < _buffered_values_num; ++i) {
336
0
                delta_values[i] = input[i] - min;
337
0
            }
338
0
        }
339
340
12
        uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8);
341
342
12
        _buffer->reserve(_buffer->size() + packing_len);
343
12
        size_t origin_size = _buffer->size();
344
12
        _buffer->resize(origin_size + packing_len);
345
12
        bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size);
346
12
    }
347
12
    uint8_t storage_format = 0;
348
12
    if (is_keep_original_value) {
349
0
        storage_format = 2;
350
12
    } else if (is_ascending) {
351
12
        storage_format = 1;
352
12
    }
353
12
    _storage_formats.push_back(storage_format);
354
12
    _bit_widths.push_back(bit_width);
355
356
12
    _buffered_values_num = 0;
357
12
}
Unexecuted instantiation: _ZN5doris10ForEncoderImE27bit_packing_one_frame_valueEPKm
Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE27bit_packing_one_frame_valueEPKS1_
Unexecuted instantiation: _ZN5doris10ForEncoderIoE27bit_packing_one_frame_valueEPKo
358
359
template <typename T>
360
65.3k
uint32_t ForEncoder<T>::flush() {
361
65.3k
    if (_buffered_values_num != 0) {
362
65.0k
        bit_packing_one_frame_value(_buffered_values);
363
65.0k
    }
364
365
    // write the footer:
366
    // 1 _storage_formats and bit_widths
367
65.3k
    DCHECK(_storage_formats.size() == _bit_widths.size())
368
0
            << "Size of _storage_formats and _bit_widths should be equal.";
369
163k
    for (size_t i = 0; i < _storage_formats.size(); i++) {
370
97.8k
        _buffer->append(&_storage_formats[i], 1);
371
97.8k
        _buffer->append(&_bit_widths[i], 1);
372
97.8k
    }
373
    // 2 frame_value_num and values_num
374
65.3k
    uint8_t frame_value_num = FRAME_VALUE_NUM;
375
65.3k
    _buffer->append(&frame_value_num, 1);
376
65.3k
    put_fixed32_le(_buffer, _values_num);
377
378
65.3k
    return cast_set<uint32_t>(_buffer->size());
379
65.3k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIaE5flushEv
Unexecuted instantiation: _ZN5doris10ForEncoderIsE5flushEv
_ZN5doris10ForEncoderIiE5flushEv
Line
Count
Source
360
14
uint32_t ForEncoder<T>::flush() {
361
14
    if (_buffered_values_num != 0) {
362
8
        bit_packing_one_frame_value(_buffered_values);
363
8
    }
364
365
    // write the footer:
366
    // 1 _storage_formats and bit_widths
367
14
    DCHECK(_storage_formats.size() == _bit_widths.size())
368
0
            << "Size of _storage_formats and _bit_widths should be equal.";
369
32
    for (size_t i = 0; i < _storage_formats.size(); i++) {
370
18
        _buffer->append(&_storage_formats[i], 1);
371
18
        _buffer->append(&_bit_widths[i], 1);
372
18
    }
373
    // 2 frame_value_num and values_num
374
14
    uint8_t frame_value_num = FRAME_VALUE_NUM;
375
14
    _buffer->append(&frame_value_num, 1);
376
14
    put_fixed32_le(_buffer, _values_num);
377
378
14
    return cast_set<uint32_t>(_buffer->size());
379
14
}
_ZN5doris10ForEncoderIlE5flushEv
Line
Count
Source
360
32.6k
uint32_t ForEncoder<T>::flush() {
361
32.6k
    if (_buffered_values_num != 0) {
362
32.5k
        bit_packing_one_frame_value(_buffered_values);
363
32.5k
    }
364
365
    // write the footer:
366
    // 1 _storage_formats and bit_widths
367
32.6k
    DCHECK(_storage_formats.size() == _bit_widths.size())
368
0
            << "Size of _storage_formats and _bit_widths should be equal.";
369
81.5k
    for (size_t i = 0; i < _storage_formats.size(); i++) {
370
48.9k
        _buffer->append(&_storage_formats[i], 1);
371
48.9k
        _buffer->append(&_bit_widths[i], 1);
372
48.9k
    }
373
    // 2 frame_value_num and values_num
374
32.6k
    uint8_t frame_value_num = FRAME_VALUE_NUM;
375
32.6k
    _buffer->append(&frame_value_num, 1);
376
32.6k
    put_fixed32_le(_buffer, _values_num);
377
378
32.6k
    return cast_set<uint32_t>(_buffer->size());
379
32.6k
}
_ZN5doris10ForEncoderInE5flushEv
Line
Count
Source
360
32.6k
uint32_t ForEncoder<T>::flush() {
361
32.6k
    if (_buffered_values_num != 0) {
362
32.5k
        bit_packing_one_frame_value(_buffered_values);
363
32.5k
    }
364
365
    // write the footer:
366
    // 1 _storage_formats and bit_widths
367
32.6k
    DCHECK(_storage_formats.size() == _bit_widths.size())
368
0
            << "Size of _storage_formats and _bit_widths should be equal.";
369
81.5k
    for (size_t i = 0; i < _storage_formats.size(); i++) {
370
48.8k
        _buffer->append(&_storage_formats[i], 1);
371
48.8k
        _buffer->append(&_bit_widths[i], 1);
372
48.8k
    }
373
    // 2 frame_value_num and values_num
374
32.6k
    uint8_t frame_value_num = FRAME_VALUE_NUM;
375
32.6k
    _buffer->append(&frame_value_num, 1);
376
32.6k
    put_fixed32_le(_buffer, _values_num);
377
378
32.6k
    return cast_set<uint32_t>(_buffer->size());
379
32.6k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIhE5flushEv
Unexecuted instantiation: _ZN5doris10ForEncoderItE5flushEv
_ZN5doris10ForEncoderIjE5flushEv
Line
Count
Source
360
6
uint32_t ForEncoder<T>::flush() {
361
6
    if (_buffered_values_num != 0) {
362
0
        bit_packing_one_frame_value(_buffered_values);
363
0
    }
364
365
    // write the footer:
366
    // 1 _storage_formats and bit_widths
367
6
    DCHECK(_storage_formats.size() == _bit_widths.size())
368
0
            << "Size of _storage_formats and _bit_widths should be equal.";
369
18
    for (size_t i = 0; i < _storage_formats.size(); i++) {
370
12
        _buffer->append(&_storage_formats[i], 1);
371
12
        _buffer->append(&_bit_widths[i], 1);
372
12
    }
373
    // 2 frame_value_num and values_num
374
6
    uint8_t frame_value_num = FRAME_VALUE_NUM;
375
6
    _buffer->append(&frame_value_num, 1);
376
6
    put_fixed32_le(_buffer, _values_num);
377
378
6
    return cast_set<uint32_t>(_buffer->size());
379
6
}
Unexecuted instantiation: _ZN5doris10ForEncoderImE5flushEv
Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE5flushEv
Unexecuted instantiation: _ZN5doris10ForEncoderIoE5flushEv
380
381
template <typename T>
382
97.8k
const T ForEncoder<T>::numeric_limits_max() {
383
97.8k
    return std::numeric_limits<T>::max();
384
97.8k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIaE18numeric_limits_maxEv
Unexecuted instantiation: _ZN5doris10ForEncoderIsE18numeric_limits_maxEv
_ZN5doris10ForEncoderIiE18numeric_limits_maxEv
Line
Count
Source
382
18
const T ForEncoder<T>::numeric_limits_max() {
383
18
    return std::numeric_limits<T>::max();
384
18
}
_ZN5doris10ForEncoderIlE18numeric_limits_maxEv
Line
Count
Source
382
48.9k
const T ForEncoder<T>::numeric_limits_max() {
383
48.9k
    return std::numeric_limits<T>::max();
384
48.9k
}
_ZN5doris10ForEncoderInE18numeric_limits_maxEv
Line
Count
Source
382
48.8k
const T ForEncoder<T>::numeric_limits_max() {
383
48.8k
    return std::numeric_limits<T>::max();
384
48.8k
}
Unexecuted instantiation: _ZN5doris10ForEncoderIhE18numeric_limits_maxEv
Unexecuted instantiation: _ZN5doris10ForEncoderItE18numeric_limits_maxEv
_ZN5doris10ForEncoderIjE18numeric_limits_maxEv
Line
Count
Source
382
12
const T ForEncoder<T>::numeric_limits_max() {
383
12
    return std::numeric_limits<T>::max();
384
12
}
Unexecuted instantiation: _ZN5doris10ForEncoderImE18numeric_limits_maxEv
Unexecuted instantiation: _ZN5doris10ForEncoderIoE18numeric_limits_maxEv
385
386
template <>
387
0
const uint24_t ForEncoder<uint24_t>::numeric_limits_max() {
388
0
    return 0XFFFFFF;
389
0
}
390
391
template <typename T>
392
65.3k
bool ForDecoder<T>::init() {
393
    // When row count is zero, the minimum footer size is 5:
394
    // only has ValuesNum(4) + FrameValueNum(1)
395
65.3k
    if (_buffer_len < 5) {
396
0
        return false;
397
0
    }
398
399
65.3k
    _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5);
400
65.3k
    _values_num = decode_fixed32_le(_buffer + _buffer_len - 4);
401
65.3k
    _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0);
402
65.3k
    _last_frame_size =
403
65.3k
            cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num));
404
405
65.3k
    size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2;
406
407
    // read _storage_formats, bit_widths and compute frame_offsets
408
65.3k
    u_int32_t frame_start_offset = 0;
409
163k
    for (uint32_t i = 0; i < _frame_count; i++) {
410
97.8k
        uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset);
411
97.8k
        uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1);
412
97.8k
        _bit_widths.push_back(bit_width);
413
97.8k
        _storage_formats.push_back(order_flag);
414
415
97.8k
        bit_width_offset += 2;
416
417
97.8k
        _frame_offsets.push_back(frame_start_offset);
418
97.8k
        if (sizeof(T) == 16) {
419
48.8k
            frame_start_offset += bit_width * _max_frame_size / 8 + 16;
420
48.9k
        } else if (sizeof(T) == 8) {
421
48.9k
            frame_start_offset += bit_width * _max_frame_size / 8 + 8;
422
48.9k
        } else {
423
30
            frame_start_offset += bit_width * _max_frame_size / 8 + 4;
424
30
        }
425
97.8k
    }
426
427
65.3k
    _out_buffer.resize(_max_frame_size);
428
65.3k
    _parsed = true;
429
430
65.3k
    return true;
431
65.3k
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE4initEv
Unexecuted instantiation: _ZN5doris10ForDecoderIsE4initEv
_ZN5doris10ForDecoderIiE4initEv
Line
Count
Source
392
14
bool ForDecoder<T>::init() {
393
    // When row count is zero, the minimum footer size is 5:
394
    // only has ValuesNum(4) + FrameValueNum(1)
395
14
    if (_buffer_len < 5) {
396
0
        return false;
397
0
    }
398
399
14
    _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5);
400
14
    _values_num = decode_fixed32_le(_buffer + _buffer_len - 4);
401
14
    _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0);
402
14
    _last_frame_size =
403
14
            cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num));
404
405
14
    size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2;
406
407
    // read _storage_formats, bit_widths and compute frame_offsets
408
14
    u_int32_t frame_start_offset = 0;
409
32
    for (uint32_t i = 0; i < _frame_count; i++) {
410
18
        uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset);
411
18
        uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1);
412
18
        _bit_widths.push_back(bit_width);
413
18
        _storage_formats.push_back(order_flag);
414
415
18
        bit_width_offset += 2;
416
417
18
        _frame_offsets.push_back(frame_start_offset);
418
18
        if (sizeof(T) == 16) {
419
0
            frame_start_offset += bit_width * _max_frame_size / 8 + 16;
420
18
        } else if (sizeof(T) == 8) {
421
0
            frame_start_offset += bit_width * _max_frame_size / 8 + 8;
422
18
        } else {
423
18
            frame_start_offset += bit_width * _max_frame_size / 8 + 4;
424
18
        }
425
18
    }
426
427
14
    _out_buffer.resize(_max_frame_size);
428
14
    _parsed = true;
429
430
14
    return true;
431
14
}
_ZN5doris10ForDecoderIlE4initEv
Line
Count
Source
392
32.6k
bool ForDecoder<T>::init() {
393
    // When row count is zero, the minimum footer size is 5:
394
    // only has ValuesNum(4) + FrameValueNum(1)
395
32.6k
    if (_buffer_len < 5) {
396
0
        return false;
397
0
    }
398
399
32.6k
    _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5);
400
32.6k
    _values_num = decode_fixed32_le(_buffer + _buffer_len - 4);
401
32.6k
    _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0);
402
32.6k
    _last_frame_size =
403
32.6k
            cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num));
404
405
32.6k
    size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2;
406
407
    // read _storage_formats, bit_widths and compute frame_offsets
408
32.6k
    u_int32_t frame_start_offset = 0;
409
81.5k
    for (uint32_t i = 0; i < _frame_count; i++) {
410
48.9k
        uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset);
411
48.9k
        uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1);
412
48.9k
        _bit_widths.push_back(bit_width);
413
48.9k
        _storage_formats.push_back(order_flag);
414
415
48.9k
        bit_width_offset += 2;
416
417
48.9k
        _frame_offsets.push_back(frame_start_offset);
418
48.9k
        if (sizeof(T) == 16) {
419
0
            frame_start_offset += bit_width * _max_frame_size / 8 + 16;
420
48.9k
        } else if (sizeof(T) == 8) {
421
48.9k
            frame_start_offset += bit_width * _max_frame_size / 8 + 8;
422
48.9k
        } else {
423
0
            frame_start_offset += bit_width * _max_frame_size / 8 + 4;
424
0
        }
425
48.9k
    }
426
427
32.6k
    _out_buffer.resize(_max_frame_size);
428
32.6k
    _parsed = true;
429
430
32.6k
    return true;
431
32.6k
}
_ZN5doris10ForDecoderInE4initEv
Line
Count
Source
392
32.6k
bool ForDecoder<T>::init() {
393
    // When row count is zero, the minimum footer size is 5:
394
    // only has ValuesNum(4) + FrameValueNum(1)
395
32.6k
    if (_buffer_len < 5) {
396
0
        return false;
397
0
    }
398
399
32.6k
    _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5);
400
32.6k
    _values_num = decode_fixed32_le(_buffer + _buffer_len - 4);
401
32.6k
    _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0);
402
32.6k
    _last_frame_size =
403
32.6k
            cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num));
404
405
32.6k
    size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2;
406
407
    // read _storage_formats, bit_widths and compute frame_offsets
408
32.6k
    u_int32_t frame_start_offset = 0;
409
81.5k
    for (uint32_t i = 0; i < _frame_count; i++) {
410
48.8k
        uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset);
411
48.8k
        uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1);
412
48.8k
        _bit_widths.push_back(bit_width);
413
48.8k
        _storage_formats.push_back(order_flag);
414
415
48.8k
        bit_width_offset += 2;
416
417
48.8k
        _frame_offsets.push_back(frame_start_offset);
418
48.8k
        if (sizeof(T) == 16) {
419
48.8k
            frame_start_offset += bit_width * _max_frame_size / 8 + 16;
420
48.8k
        } else if (sizeof(T) == 8) {
421
0
            frame_start_offset += bit_width * _max_frame_size / 8 + 8;
422
0
        } else {
423
0
            frame_start_offset += bit_width * _max_frame_size / 8 + 4;
424
0
        }
425
48.8k
    }
426
427
32.6k
    _out_buffer.resize(_max_frame_size);
428
32.6k
    _parsed = true;
429
430
32.6k
    return true;
431
32.6k
}
Unexecuted instantiation: _ZN5doris10ForDecoderIhE4initEv
Unexecuted instantiation: _ZN5doris10ForDecoderItE4initEv
_ZN5doris10ForDecoderIjE4initEv
Line
Count
Source
392
6
bool ForDecoder<T>::init() {
393
    // When row count is zero, the minimum footer size is 5:
394
    // only has ValuesNum(4) + FrameValueNum(1)
395
6
    if (_buffer_len < 5) {
396
0
        return false;
397
0
    }
398
399
6
    _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5);
400
6
    _values_num = decode_fixed32_le(_buffer + _buffer_len - 4);
401
6
    _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0);
402
6
    _last_frame_size =
403
6
            cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num));
404
405
6
    size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2;
406
407
    // read _storage_formats, bit_widths and compute frame_offsets
408
6
    u_int32_t frame_start_offset = 0;
409
18
    for (uint32_t i = 0; i < _frame_count; i++) {
410
12
        uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset);
411
12
        uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1);
412
12
        _bit_widths.push_back(bit_width);
413
12
        _storage_formats.push_back(order_flag);
414
415
12
        bit_width_offset += 2;
416
417
12
        _frame_offsets.push_back(frame_start_offset);
418
12
        if (sizeof(T) == 16) {
419
0
            frame_start_offset += bit_width * _max_frame_size / 8 + 16;
420
12
        } else if (sizeof(T) == 8) {
421
0
            frame_start_offset += bit_width * _max_frame_size / 8 + 8;
422
12
        } else {
423
12
            frame_start_offset += bit_width * _max_frame_size / 8 + 4;
424
12
        }
425
12
    }
426
427
6
    _out_buffer.resize(_max_frame_size);
428
6
    _parsed = true;
429
430
6
    return true;
431
6
}
Unexecuted instantiation: _ZN5doris10ForDecoderImE4initEv
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE4initEv
Unexecuted instantiation: _ZN5doris10ForDecoderIoE4initEv
432
433
// todo(kks): improve this method by SIMD instructions
434
435
template <typename T>
436
template <typename U>
437
void ForDecoder<T>::bit_unpack_optimize(const uint8_t* input, uint8_t in_num, int bit_width,
438
162k
                                        T* output) {
439
162k
    static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value,
440
162k
                  "bit_unpack_optimize only supports U = int64_t or __int128_t");
441
162k
    constexpr int u_size = sizeof(U);                   // Size of U
442
162k
    constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size)
443
162k
    int valid_bit = 0;                                  // How many valid bits
444
162k
    int need_bit = 0;                                   // still need
445
162k
    size_t input_size = (in_num * bit_width + 7) >> 3;  // input's size
446
162k
    int full_batch_size =
447
162k
            cast_set<int>((input_size >> u_size_shift)
448
162k
                          << u_size_shift);     // Adjust input_size to a multiple of u_size
449
162k
    int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size.
450
    // The number of bits in input to adjust to multiples of 8 and thus more
451
162k
    int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width));
452
453
    // to ensure that only bit_width bits are valid
454
162k
    T output_mask;
455
162k
    if (bit_width >= static_cast<int>(sizeof(T) * 8)) {
456
0
        output_mask = static_cast<T>(~T(0));
457
162k
    } else {
458
162k
        output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1);
459
162k
    }
460
461
162k
    U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking
462
463
8.96M
    for (int i = 0; i < full_batch_size; i += u_size) {
464
8.80M
        s = 0;
465
466
8.80M
        s = to_endian<std::endian::big>(*((U*)(input + i)));
467
468
        // Determine what the valid bits are based on u_size
469
8.80M
        valid_bit = u_size << 3;
470
471
        // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop.
472
8.80M
        if (tail_count == 0 && i == full_batch_size - u_size) {
473
43.5k
            valid_bit -= more_bit;
474
43.5k
            s >>= more_bit;
475
43.5k
        }
476
477
8.80M
        if (need_bit) {
478
            // The last time we take away the high bit_width - need_bit,
479
            // we need to make up the rest of the need_bit from the width.
480
            // Use valid_bit - need_bit to compute high need_bit bits of s
481
            // perform an AND operation to ensure that only need_bit bits are valid
482
8.18M
            auto mask = (static_cast<U>(1) << need_bit) - 1;
483
8.18M
            auto shifted = s >> (valid_bit - need_bit);
484
8.18M
            auto masked_result = shifted & mask;
485
8.18M
            if constexpr (sizeof(T) <= 4) {
486
0
                *output |= static_cast<T>(static_cast<uint32_t>(masked_result));
487
8.18M
            } else {
488
8.18M
                *output |= static_cast<T>(masked_result);
489
8.18M
            }
490
8.18M
            output++;
491
8.18M
            valid_bit -= need_bit;
492
8.18M
        }
493
494
8.80M
        int num = valid_bit / bit_width;             // How many outputs can be processed at a time
495
8.80M
        int remainder = valid_bit - num * bit_width; // How many bits are left to store
496
497
        // Starting with the highest valid bit, take out bit_width bits in sequence
498
        // perform an AND operation with output_mask to ensure that only bit_width bits are valid
499
        // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end
500
        // But since there are still remainder bits that can't be processed, need to add the remainder
501
17.0M
        for (int j = 0; j < num; j++) {
502
8.22M
            *output =
503
8.22M
                    static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask);
504
8.22M
            output++;
505
8.22M
        }
506
507
8.80M
        if (remainder) {
508
            // Process the last remaining remainder bit.
509
            // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits.
510
            // output = y << (bit_width - remainder) Use the high bit_width - remainder bit
511
8.28M
            if constexpr (sizeof(T) <= 4) {
512
0
                auto masked_value = static_cast<T>(
513
0
                        static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1)));
514
0
                *output = static_cast<T>(masked_value << (bit_width - remainder));
515
8.28M
            } else {
516
8.28M
                auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1)));
517
8.28M
                *output = static_cast<T>(masked_value << (bit_width - remainder));
518
8.28M
            }
519
            // Already have remainder bits, next time need bit_width - remainder bits
520
8.28M
            need_bit = bit_width - remainder;
521
8.28M
        } else {
522
515k
            need_bit = 0;
523
515k
        }
524
8.80M
    }
525
526
    // remainder
527
162k
    if (tail_count) {
528
        // Put the tail_count numbers in the input into s in order, each number occupies 8 bit
529
954k
        for (int i = 0; i < tail_count; i++) {
530
835k
            s <<= 8;
531
835k
            s |= input[full_batch_size + i];
532
835k
        }
533
534
        // tail * 8 is the number of bits that are left to process
535
        // tail * 8 - more_bit is to remove the last more_bit
536
118k
        valid_bit = (tail_count << 3) - more_bit;
537
118k
        s >>= more_bit;
538
539
        // same as before
540
118k
        if (need_bit) {
541
108k
            if constexpr (sizeof(T) <= 4) {
542
0
                *output |= static_cast<T>(static_cast<uint32_t>(
543
0
                        (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1)));
544
108k
            } else {
545
108k
                *output |= static_cast<T>((s >> (valid_bit - need_bit)) &
546
108k
                                          ((static_cast<U>(1) << need_bit) - 1));
547
108k
            }
548
108k
            output++;
549
108k
            valid_bit -= need_bit;
550
108k
        }
551
552
118k
        int num = valid_bit / bit_width; // How many outputs can be processed at a time
553
554
        // same as before
555
253k
        for (int j = 0; j < num; j++) {
556
134k
            *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask);
557
134k
            output++;
558
134k
        }
559
118k
    }
560
162k
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE19bit_unpack_optimizeIlEEvPKhhiPa
Unexecuted instantiation: _ZN5doris10ForDecoderIaE19bit_unpack_optimizeInEEvPKhhiPa
Unexecuted instantiation: _ZN5doris10ForDecoderIsE19bit_unpack_optimizeIlEEvPKhhiPs
Unexecuted instantiation: _ZN5doris10ForDecoderIsE19bit_unpack_optimizeInEEvPKhhiPs
_ZN5doris10ForDecoderIiE19bit_unpack_optimizeIlEEvPKhhiPi
Line
Count
Source
438
18
                                        T* output) {
439
18
    static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value,
440
18
                  "bit_unpack_optimize only supports U = int64_t or __int128_t");
441
18
    constexpr int u_size = sizeof(U);                   // Size of U
442
18
    constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size)
443
18
    int valid_bit = 0;                                  // How many valid bits
444
18
    int need_bit = 0;                                   // still need
445
18
    size_t input_size = (in_num * bit_width + 7) >> 3;  // input's size
446
18
    int full_batch_size =
447
18
            cast_set<int>((input_size >> u_size_shift)
448
18
                          << u_size_shift);     // Adjust input_size to a multiple of u_size
449
18
    int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size.
450
    // The number of bits in input to adjust to multiples of 8 and thus more
451
18
    int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width));
452
453
    // to ensure that only bit_width bits are valid
454
18
    T output_mask;
455
18
    if (bit_width >= static_cast<int>(sizeof(T) * 8)) {
456
0
        output_mask = static_cast<T>(~T(0));
457
18
    } else {
458
18
        output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1);
459
18
    }
460
461
18
    U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking
462
463
42
    for (int i = 0; i < full_batch_size; i += u_size) {
464
24
        s = 0;
465
466
24
        s = to_endian<std::endian::big>(*((U*)(input + i)));
467
468
        // Determine what the valid bits are based on u_size
469
24
        valid_bit = u_size << 3;
470
471
        // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop.
472
24
        if (tail_count == 0 && i == full_batch_size - u_size) {
473
14
            valid_bit -= more_bit;
474
14
            s >>= more_bit;
475
14
        }
476
477
24
        if (need_bit) {
478
            // The last time we take away the high bit_width - need_bit,
479
            // we need to make up the rest of the need_bit from the width.
480
            // Use valid_bit - need_bit to compute high need_bit bits of s
481
            // perform an AND operation to ensure that only need_bit bits are valid
482
0
            auto mask = (static_cast<U>(1) << need_bit) - 1;
483
0
            auto shifted = s >> (valid_bit - need_bit);
484
0
            auto masked_result = shifted & mask;
485
0
            if constexpr (sizeof(T) <= 4) {
486
0
                *output |= static_cast<T>(static_cast<uint32_t>(masked_result));
487
            } else {
488
                *output |= static_cast<T>(masked_result);
489
            }
490
0
            output++;
491
0
            valid_bit -= need_bit;
492
0
        }
493
494
24
        int num = valid_bit / bit_width;             // How many outputs can be processed at a time
495
24
        int remainder = valid_bit - num * bit_width; // How many bits are left to store
496
497
        // Starting with the highest valid bit, take out bit_width bits in sequence
498
        // perform an AND operation with output_mask to ensure that only bit_width bits are valid
499
        // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end
500
        // But since there are still remainder bits that can't be processed, need to add the remainder
501
1.56k
        for (int j = 0; j < num; j++) {
502
1.53k
            *output =
503
1.53k
                    static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask);
504
1.53k
            output++;
505
1.53k
        }
506
507
24
        if (remainder) {
508
            // Process the last remaining remainder bit.
509
            // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits.
510
            // output = y << (bit_width - remainder) Use the high bit_width - remainder bit
511
0
            if constexpr (sizeof(T) <= 4) {
512
0
                auto masked_value = static_cast<T>(
513
0
                        static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1)));
514
0
                *output = static_cast<T>(masked_value << (bit_width - remainder));
515
            } else {
516
                auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1)));
517
                *output = static_cast<T>(masked_value << (bit_width - remainder));
518
            }
519
            // Already have remainder bits, next time need bit_width - remainder bits
520
0
            need_bit = bit_width - remainder;
521
24
        } else {
522
24
            need_bit = 0;
523
24
        }
524
24
    }
525
526
    // remainder
527
18
    if (tail_count) {
528
        // Put the tail_count numbers in the input into s in order, each number occupies 8 bit
529
4
        for (int i = 0; i < tail_count; i++) {
530
2
            s <<= 8;
531
2
            s |= input[full_batch_size + i];
532
2
        }
533
534
        // tail * 8 is the number of bits that are left to process
535
        // tail * 8 - more_bit is to remove the last more_bit
536
2
        valid_bit = (tail_count << 3) - more_bit;
537
2
        s >>= more_bit;
538
539
        // same as before
540
2
        if (need_bit) {
541
0
            if constexpr (sizeof(T) <= 4) {
542
0
                *output |= static_cast<T>(static_cast<uint32_t>(
543
0
                        (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1)));
544
            } else {
545
                *output |= static_cast<T>((s >> (valid_bit - need_bit)) &
546
                                          ((static_cast<U>(1) << need_bit) - 1));
547
            }
548
0
            output++;
549
0
            valid_bit -= need_bit;
550
0
        }
551
552
2
        int num = valid_bit / bit_width; // How many outputs can be processed at a time
553
554
        // same as before
555
6
        for (int j = 0; j < num; j++) {
556
4
            *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask);
557
4
            output++;
558
4
        }
559
2
    }
560
18
}
Unexecuted instantiation: _ZN5doris10ForDecoderIiE19bit_unpack_optimizeInEEvPKhhiPi
_ZN5doris10ForDecoderIlE19bit_unpack_optimizeIlEEvPKhhiPl
Line
Count
Source
438
24.6k
                                        T* output) {
439
24.6k
    static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value,
440
24.6k
                  "bit_unpack_optimize only supports U = int64_t or __int128_t");
441
24.6k
    constexpr int u_size = sizeof(U);                   // Size of U
442
24.6k
    constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size)
443
24.6k
    int valid_bit = 0;                                  // How many valid bits
444
24.6k
    int need_bit = 0;                                   // still need
445
24.6k
    size_t input_size = (in_num * bit_width + 7) >> 3;  // input's size
446
24.6k
    int full_batch_size =
447
24.6k
            cast_set<int>((input_size >> u_size_shift)
448
24.6k
                          << u_size_shift);     // Adjust input_size to a multiple of u_size
449
24.6k
    int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size.
450
    // The number of bits in input to adjust to multiples of 8 and thus more
451
24.6k
    int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width));
452
453
    // to ensure that only bit_width bits are valid
454
24.6k
    T output_mask;
455
24.6k
    if (bit_width >= static_cast<int>(sizeof(T) * 8)) {
456
0
        output_mask = static_cast<T>(~T(0));
457
24.6k
    } else {
458
24.6k
        output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1);
459
24.6k
    }
460
461
24.6k
    U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking
462
463
556k
    for (int i = 0; i < full_batch_size; i += u_size) {
464
532k
        s = 0;
465
466
532k
        s = to_endian<std::endian::big>(*((U*)(input + i)));
467
468
        // Determine what the valid bits are based on u_size
469
532k
        valid_bit = u_size << 3;
470
471
        // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop.
472
532k
        if (tail_count == 0 && i == full_batch_size - u_size) {
473
10.3k
            valid_bit -= more_bit;
474
10.3k
            s >>= more_bit;
475
10.3k
        }
476
477
532k
        if (need_bit) {
478
            // The last time we take away the high bit_width - need_bit,
479
            // we need to make up the rest of the need_bit from the width.
480
            // Use valid_bit - need_bit to compute high need_bit bits of s
481
            // perform an AND operation to ensure that only need_bit bits are valid
482
414k
            auto mask = (static_cast<U>(1) << need_bit) - 1;
483
414k
            auto shifted = s >> (valid_bit - need_bit);
484
414k
            auto masked_result = shifted & mask;
485
            if constexpr (sizeof(T) <= 4) {
486
                *output |= static_cast<T>(static_cast<uint32_t>(masked_result));
487
414k
            } else {
488
414k
                *output |= static_cast<T>(masked_result);
489
414k
            }
490
414k
            output++;
491
414k
            valid_bit -= need_bit;
492
414k
        }
493
494
532k
        int num = valid_bit / bit_width;             // How many outputs can be processed at a time
495
532k
        int remainder = valid_bit - num * bit_width; // How many bits are left to store
496
497
        // Starting with the highest valid bit, take out bit_width bits in sequence
498
        // perform an AND operation with output_mask to ensure that only bit_width bits are valid
499
        // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end
500
        // But since there are still remainder bits that can't be processed, need to add the remainder
501
2.15M
        for (int j = 0; j < num; j++) {
502
1.61M
            *output =
503
1.61M
                    static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask);
504
1.61M
            output++;
505
1.61M
        }
506
507
532k
        if (remainder) {
508
            // Process the last remaining remainder bit.
509
            // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits.
510
            // output = y << (bit_width - remainder) Use the high bit_width - remainder bit
511
            if constexpr (sizeof(T) <= 4) {
512
                auto masked_value = static_cast<T>(
513
                        static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1)));
514
                *output = static_cast<T>(masked_value << (bit_width - remainder));
515
424k
            } else {
516
424k
                auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1)));
517
424k
                *output = static_cast<T>(masked_value << (bit_width - remainder));
518
424k
            }
519
            // Already have remainder bits, next time need bit_width - remainder bits
520
424k
            need_bit = bit_width - remainder;
521
424k
        } else {
522
107k
            need_bit = 0;
523
107k
        }
524
532k
    }
525
526
    // remainder
527
24.6k
    if (tail_count) {
528
        // Put the tail_count numbers in the input into s in order, each number occupies 8 bit
529
70.0k
        for (int i = 0; i < tail_count; i++) {
530
56.0k
            s <<= 8;
531
56.0k
            s |= input[full_batch_size + i];
532
56.0k
        }
533
534
        // tail * 8 is the number of bits that are left to process
535
        // tail * 8 - more_bit is to remove the last more_bit
536
13.9k
        valid_bit = (tail_count << 3) - more_bit;
537
13.9k
        s >>= more_bit;
538
539
        // same as before
540
13.9k
        if (need_bit) {
541
            if constexpr (sizeof(T) <= 4) {
542
                *output |= static_cast<T>(static_cast<uint32_t>(
543
                        (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1)));
544
10.2k
            } else {
545
10.2k
                *output |= static_cast<T>((s >> (valid_bit - need_bit)) &
546
10.2k
                                          ((static_cast<U>(1) << need_bit) - 1));
547
10.2k
            }
548
10.2k
            output++;
549
10.2k
            valid_bit -= need_bit;
550
10.2k
        }
551
552
13.9k
        int num = valid_bit / bit_width; // How many outputs can be processed at a time
553
554
        // same as before
555
61.3k
        for (int j = 0; j < num; j++) {
556
47.3k
            *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask);
557
47.3k
            output++;
558
47.3k
        }
559
13.9k
    }
560
24.6k
}
_ZN5doris10ForDecoderIlE19bit_unpack_optimizeInEEvPKhhiPl
Line
Count
Source
438
24.3k
                                        T* output) {
439
24.3k
    static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value,
440
24.3k
                  "bit_unpack_optimize only supports U = int64_t or __int128_t");
441
24.3k
    constexpr int u_size = sizeof(U);                   // Size of U
442
24.3k
    constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size)
443
24.3k
    int valid_bit = 0;                                  // How many valid bits
444
24.3k
    int need_bit = 0;                                   // still need
445
24.3k
    size_t input_size = (in_num * bit_width + 7) >> 3;  // input's size
446
24.3k
    int full_batch_size =
447
24.3k
            cast_set<int>((input_size >> u_size_shift)
448
24.3k
                          << u_size_shift);     // Adjust input_size to a multiple of u_size
449
24.3k
    int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size.
450
    // The number of bits in input to adjust to multiples of 8 and thus more
451
24.3k
    int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width));
452
453
    // to ensure that only bit_width bits are valid
454
24.3k
    T output_mask;
455
24.3k
    if (bit_width >= static_cast<int>(sizeof(T) * 8)) {
456
0
        output_mask = static_cast<T>(~T(0));
457
24.3k
    } else {
458
24.3k
        output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1);
459
24.3k
    }
460
461
24.3k
    U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking
462
463
807k
    for (int i = 0; i < full_batch_size; i += u_size) {
464
783k
        s = 0;
465
466
783k
        s = to_endian<std::endian::big>(*((U*)(input + i)));
467
468
        // Determine what the valid bits are based on u_size
469
783k
        valid_bit = u_size << 3;
470
471
        // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop.
472
783k
        if (tail_count == 0 && i == full_batch_size - u_size) {
473
9.10k
            valid_bit -= more_bit;
474
9.10k
            s >>= more_bit;
475
9.10k
        }
476
477
783k
        if (need_bit) {
478
            // The last time we take away the high bit_width - need_bit,
479
            // we need to make up the rest of the need_bit from the width.
480
            // Use valid_bit - need_bit to compute high need_bit bits of s
481
            // perform an AND operation to ensure that only need_bit bits are valid
482
735k
            auto mask = (static_cast<U>(1) << need_bit) - 1;
483
735k
            auto shifted = s >> (valid_bit - need_bit);
484
735k
            auto masked_result = shifted & mask;
485
            if constexpr (sizeof(T) <= 4) {
486
                *output |= static_cast<T>(static_cast<uint32_t>(masked_result));
487
735k
            } else {
488
735k
                *output |= static_cast<T>(masked_result);
489
735k
            }
490
735k
            output++;
491
735k
            valid_bit -= need_bit;
492
735k
        }
493
494
783k
        int num = valid_bit / bit_width;             // How many outputs can be processed at a time
495
783k
        int remainder = valid_bit - num * bit_width; // How many bits are left to store
496
497
        // Starting with the highest valid bit, take out bit_width bits in sequence
498
        // perform an AND operation with output_mask to ensure that only bit_width bits are valid
499
        // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end
500
        // But since there are still remainder bits that can't be processed, need to add the remainder
501
2.10M
        for (int j = 0; j < num; j++) {
502
1.32M
            *output =
503
1.32M
                    static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask);
504
1.32M
            output++;
505
1.32M
        }
506
507
783k
        if (remainder) {
508
            // Process the last remaining remainder bit.
509
            // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits.
510
            // output = y << (bit_width - remainder) Use the high bit_width - remainder bit
511
            if constexpr (sizeof(T) <= 4) {
512
                auto masked_value = static_cast<T>(
513
                        static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1)));
514
                *output = static_cast<T>(masked_value << (bit_width - remainder));
515
749k
            } else {
516
749k
                auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1)));
517
749k
                *output = static_cast<T>(masked_value << (bit_width - remainder));
518
749k
            }
519
            // Already have remainder bits, next time need bit_width - remainder bits
520
749k
            need_bit = bit_width - remainder;
521
749k
        } else {
522
33.9k
            need_bit = 0;
523
33.9k
        }
524
783k
    }
525
526
    // remainder
527
24.3k
    if (tail_count) {
528
        // Put the tail_count numbers in the input into s in order, each number occupies 8 bit
529
137k
        for (int i = 0; i < tail_count; i++) {
530
121k
            s <<= 8;
531
121k
            s |= input[full_batch_size + i];
532
121k
        }
533
534
        // tail * 8 is the number of bits that are left to process
535
        // tail * 8 - more_bit is to remove the last more_bit
536
15.2k
        valid_bit = (tail_count << 3) - more_bit;
537
15.2k
        s >>= more_bit;
538
539
        // same as before
540
15.2k
        if (need_bit) {
541
            if constexpr (sizeof(T) <= 4) {
542
                *output |= static_cast<T>(static_cast<uint32_t>(
543
                        (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1)));
544
14.6k
            } else {
545
14.6k
                *output |= static_cast<T>((s >> (valid_bit - need_bit)) &
546
14.6k
                                          ((static_cast<U>(1) << need_bit) - 1));
547
14.6k
            }
548
14.6k
            output++;
549
14.6k
            valid_bit -= need_bit;
550
14.6k
        }
551
552
15.2k
        int num = valid_bit / bit_width; // How many outputs can be processed at a time
553
554
        // same as before
555
28.2k
        for (int j = 0; j < num; j++) {
556
13.0k
            *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask);
557
13.0k
            output++;
558
13.0k
        }
559
15.2k
    }
560
24.3k
}
_ZN5doris10ForDecoderInE19bit_unpack_optimizeIlEEvPKhhiPn
Line
Count
Source
438
16.5k
                                        T* output) {
439
16.5k
    static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value,
440
16.5k
                  "bit_unpack_optimize only supports U = int64_t or __int128_t");
441
16.5k
    constexpr int u_size = sizeof(U);                   // Size of U
442
16.5k
    constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size)
443
16.5k
    int valid_bit = 0;                                  // How many valid bits
444
16.5k
    int need_bit = 0;                                   // still need
445
16.5k
    size_t input_size = (in_num * bit_width + 7) >> 3;  // input's size
446
16.5k
    int full_batch_size =
447
16.5k
            cast_set<int>((input_size >> u_size_shift)
448
16.5k
                          << u_size_shift);     // Adjust input_size to a multiple of u_size
449
16.5k
    int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size.
450
    // The number of bits in input to adjust to multiples of 8 and thus more
451
16.5k
    int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width));
452
453
    // to ensure that only bit_width bits are valid
454
16.5k
    T output_mask;
455
16.5k
    if (bit_width >= static_cast<int>(sizeof(T) * 8)) {
456
0
        output_mask = static_cast<T>(~T(0));
457
16.5k
    } else {
458
16.5k
        output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1);
459
16.5k
    }
460
461
16.5k
    U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking
462
463
548k
    for (int i = 0; i < full_batch_size; i += u_size) {
464
532k
        s = 0;
465
466
532k
        s = to_endian<std::endian::big>(*((U*)(input + i)));
467
468
        // Determine what the valid bits are based on u_size
469
532k
        valid_bit = u_size << 3;
470
471
        // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop.
472
532k
        if (tail_count == 0 && i == full_batch_size - u_size) {
473
2.24k
            valid_bit -= more_bit;
474
2.24k
            s >>= more_bit;
475
2.24k
        }
476
477
532k
        if (need_bit) {
478
            // The last time we take away the high bit_width - need_bit,
479
            // we need to make up the rest of the need_bit from the width.
480
            // Use valid_bit - need_bit to compute high need_bit bits of s
481
            // perform an AND operation to ensure that only need_bit bits are valid
482
414k
            auto mask = (static_cast<U>(1) << need_bit) - 1;
483
414k
            auto shifted = s >> (valid_bit - need_bit);
484
414k
            auto masked_result = shifted & mask;
485
            if constexpr (sizeof(T) <= 4) {
486
                *output |= static_cast<T>(static_cast<uint32_t>(masked_result));
487
414k
            } else {
488
414k
                *output |= static_cast<T>(masked_result);
489
414k
            }
490
414k
            output++;
491
414k
            valid_bit -= need_bit;
492
414k
        }
493
494
532k
        int num = valid_bit / bit_width;             // How many outputs can be processed at a time
495
532k
        int remainder = valid_bit - num * bit_width; // How many bits are left to store
496
497
        // Starting with the highest valid bit, take out bit_width bits in sequence
498
        // perform an AND operation with output_mask to ensure that only bit_width bits are valid
499
        // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end
500
        // But since there are still remainder bits that can't be processed, need to add the remainder
501
2.14M
        for (int j = 0; j < num; j++) {
502
1.61M
            *output =
503
1.61M
                    static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask);
504
1.61M
            output++;
505
1.61M
        }
506
507
532k
        if (remainder) {
508
            // Process the last remaining remainder bit.
509
            // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits.
510
            // output = y << (bit_width - remainder) Use the high bit_width - remainder bit
511
            if constexpr (sizeof(T) <= 4) {
512
                auto masked_value = static_cast<T>(
513
                        static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1)));
514
                *output = static_cast<T>(masked_value << (bit_width - remainder));
515
424k
            } else {
516
424k
                auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1)));
517
424k
                *output = static_cast<T>(masked_value << (bit_width - remainder));
518
424k
            }
519
            // Already have remainder bits, next time need bit_width - remainder bits
520
424k
            need_bit = bit_width - remainder;
521
424k
        } else {
522
107k
            need_bit = 0;
523
107k
        }
524
532k
    }
525
526
    // remainder
527
16.5k
    if (tail_count) {
528
        // Put the tail_count numbers in the input into s in order, each number occupies 8 bit
529
70.4k
        for (int i = 0; i < tail_count; i++) {
530
56.3k
            s <<= 8;
531
56.3k
            s |= input[full_batch_size + i];
532
56.3k
        }
533
534
        // tail * 8 is the number of bits that are left to process
535
        // tail * 8 - more_bit is to remove the last more_bit
536
14.0k
        valid_bit = (tail_count << 3) - more_bit;
537
14.0k
        s >>= more_bit;
538
539
        // same as before
540
14.0k
        if (need_bit) {
541
            if constexpr (sizeof(T) <= 4) {
542
                *output |= static_cast<T>(static_cast<uint32_t>(
543
                        (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1)));
544
10.2k
            } else {
545
10.2k
                *output |= static_cast<T>((s >> (valid_bit - need_bit)) &
546
10.2k
                                          ((static_cast<U>(1) << need_bit) - 1));
547
10.2k
            }
548
10.2k
            output++;
549
10.2k
            valid_bit -= need_bit;
550
10.2k
        }
551
552
14.0k
        int num = valid_bit / bit_width; // How many outputs can be processed at a time
553
554
        // same as before
555
61.5k
        for (int j = 0; j < num; j++) {
556
47.4k
            *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask);
557
47.4k
            output++;
558
47.4k
        }
559
14.0k
    }
560
16.5k
}
_ZN5doris10ForDecoderInE19bit_unpack_optimizeInEEvPKhhiPn
Line
Count
Source
438
97.0k
                                        T* output) {
439
97.0k
    static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value,
440
97.0k
                  "bit_unpack_optimize only supports U = int64_t or __int128_t");
441
97.0k
    constexpr int u_size = sizeof(U);                   // Size of U
442
97.0k
    constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size)
443
97.0k
    int valid_bit = 0;                                  // How many valid bits
444
97.0k
    int need_bit = 0;                                   // still need
445
97.0k
    size_t input_size = (in_num * bit_width + 7) >> 3;  // input's size
446
97.0k
    int full_batch_size =
447
97.0k
            cast_set<int>((input_size >> u_size_shift)
448
97.0k
                          << u_size_shift);     // Adjust input_size to a multiple of u_size
449
97.0k
    int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size.
450
    // The number of bits in input to adjust to multiples of 8 and thus more
451
97.0k
    int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width));
452
453
    // to ensure that only bit_width bits are valid
454
97.0k
    T output_mask;
455
97.0k
    if (bit_width >= static_cast<int>(sizeof(T) * 8)) {
456
0
        output_mask = static_cast<T>(~T(0));
457
97.0k
    } else {
458
97.0k
        output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1);
459
97.0k
    }
460
461
97.0k
    U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking
462
463
7.05M
    for (int i = 0; i < full_batch_size; i += u_size) {
464
6.95M
        s = 0;
465
466
6.95M
        s = to_endian<std::endian::big>(*((U*)(input + i)));
467
468
        // Determine what the valid bits are based on u_size
469
6.95M
        valid_bit = u_size << 3;
470
471
        // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop.
472
6.95M
        if (tail_count == 0 && i == full_batch_size - u_size) {
473
21.7k
            valid_bit -= more_bit;
474
21.7k
            s >>= more_bit;
475
21.7k
        }
476
477
6.95M
        if (need_bit) {
478
            // The last time we take away the high bit_width - need_bit,
479
            // we need to make up the rest of the need_bit from the width.
480
            // Use valid_bit - need_bit to compute high need_bit bits of s
481
            // perform an AND operation to ensure that only need_bit bits are valid
482
6.61M
            auto mask = (static_cast<U>(1) << need_bit) - 1;
483
6.61M
            auto shifted = s >> (valid_bit - need_bit);
484
6.61M
            auto masked_result = shifted & mask;
485
            if constexpr (sizeof(T) <= 4) {
486
                *output |= static_cast<T>(static_cast<uint32_t>(masked_result));
487
6.61M
            } else {
488
6.61M
                *output |= static_cast<T>(masked_result);
489
6.61M
            }
490
6.61M
            output++;
491
6.61M
            valid_bit -= need_bit;
492
6.61M
        }
493
494
6.95M
        int num = valid_bit / bit_width;             // How many outputs can be processed at a time
495
6.95M
        int remainder = valid_bit - num * bit_width; // How many bits are left to store
496
497
        // Starting with the highest valid bit, take out bit_width bits in sequence
498
        // perform an AND operation with output_mask to ensure that only bit_width bits are valid
499
        // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end
500
        // But since there are still remainder bits that can't be processed, need to add the remainder
501
10.6M
        for (int j = 0; j < num; j++) {
502
3.66M
            *output =
503
3.66M
                    static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask);
504
3.66M
            output++;
505
3.66M
        }
506
507
6.95M
        if (remainder) {
508
            // Process the last remaining remainder bit.
509
            // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits.
510
            // output = y << (bit_width - remainder) Use the high bit_width - remainder bit
511
            if constexpr (sizeof(T) <= 4) {
512
                auto masked_value = static_cast<T>(
513
                        static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1)));
514
                *output = static_cast<T>(masked_value << (bit_width - remainder));
515
6.69M
            } else {
516
6.69M
                auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1)));
517
6.69M
                *output = static_cast<T>(masked_value << (bit_width - remainder));
518
6.69M
            }
519
            // Already have remainder bits, next time need bit_width - remainder bits
520
6.69M
            need_bit = bit_width - remainder;
521
6.69M
        } else {
522
265k
            need_bit = 0;
523
265k
        }
524
6.95M
    }
525
526
    // remainder
527
97.0k
    if (tail_count) {
528
        // Put the tail_count numbers in the input into s in order, each number occupies 8 bit
529
676k
        for (int i = 0; i < tail_count; i++) {
530
601k
            s <<= 8;
531
601k
            s |= input[full_batch_size + i];
532
601k
        }
533
534
        // tail * 8 is the number of bits that are left to process
535
        // tail * 8 - more_bit is to remove the last more_bit
536
75.2k
        valid_bit = (tail_count << 3) - more_bit;
537
75.2k
        s >>= more_bit;
538
539
        // same as before
540
75.2k
        if (need_bit) {
541
            if constexpr (sizeof(T) <= 4) {
542
                *output |= static_cast<T>(static_cast<uint32_t>(
543
                        (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1)));
544
72.8k
            } else {
545
72.8k
                *output |= static_cast<T>((s >> (valid_bit - need_bit)) &
546
72.8k
                                          ((static_cast<U>(1) << need_bit) - 1));
547
72.8k
            }
548
72.8k
            output++;
549
72.8k
            valid_bit -= need_bit;
550
72.8k
        }
551
552
75.2k
        int num = valid_bit / bit_width; // How many outputs can be processed at a time
553
554
        // same as before
555
101k
        for (int j = 0; j < num; j++) {
556
26.6k
            *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask);
557
26.6k
            output++;
558
26.6k
        }
559
75.2k
    }
560
97.0k
}
Unexecuted instantiation: _ZN5doris10ForDecoderIhE19bit_unpack_optimizeIlEEvPKhhiPh
Unexecuted instantiation: _ZN5doris10ForDecoderIhE19bit_unpack_optimizeInEEvPKhhiPh
Unexecuted instantiation: _ZN5doris10ForDecoderItE19bit_unpack_optimizeIlEEvPKhhiPt
Unexecuted instantiation: _ZN5doris10ForDecoderItE19bit_unpack_optimizeInEEvPKhhiPt
_ZN5doris10ForDecoderIjE19bit_unpack_optimizeIlEEvPKhhiPj
Line
Count
Source
438
10
                                        T* output) {
439
10
    static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value,
440
10
                  "bit_unpack_optimize only supports U = int64_t or __int128_t");
441
10
    constexpr int u_size = sizeof(U);                   // Size of U
442
10
    constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size)
443
10
    int valid_bit = 0;                                  // How many valid bits
444
10
    int need_bit = 0;                                   // still need
445
10
    size_t input_size = (in_num * bit_width + 7) >> 3;  // input's size
446
10
    int full_batch_size =
447
10
            cast_set<int>((input_size >> u_size_shift)
448
10
                          << u_size_shift);     // Adjust input_size to a multiple of u_size
449
10
    int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size.
450
    // The number of bits in input to adjust to multiples of 8 and thus more
451
10
    int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width));
452
453
    // to ensure that only bit_width bits are valid
454
10
    T output_mask;
455
10
    if (bit_width >= static_cast<int>(sizeof(T) * 8)) {
456
0
        output_mask = static_cast<T>(~T(0));
457
10
    } else {
458
10
        output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1);
459
10
    }
460
461
10
    U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking
462
463
30
    for (int i = 0; i < full_batch_size; i += u_size) {
464
20
        s = 0;
465
466
20
        s = to_endian<std::endian::big>(*((U*)(input + i)));
467
468
        // Determine what the valid bits are based on u_size
469
20
        valid_bit = u_size << 3;
470
471
        // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop.
472
20
        if (tail_count == 0 && i == full_batch_size - u_size) {
473
10
            valid_bit -= more_bit;
474
10
            s >>= more_bit;
475
10
        }
476
477
20
        if (need_bit) {
478
            // The last time we take away the high bit_width - need_bit,
479
            // we need to make up the rest of the need_bit from the width.
480
            // Use valid_bit - need_bit to compute high need_bit bits of s
481
            // perform an AND operation to ensure that only need_bit bits are valid
482
0
            auto mask = (static_cast<U>(1) << need_bit) - 1;
483
0
            auto shifted = s >> (valid_bit - need_bit);
484
0
            auto masked_result = shifted & mask;
485
0
            if constexpr (sizeof(T) <= 4) {
486
0
                *output |= static_cast<T>(static_cast<uint32_t>(masked_result));
487
            } else {
488
                *output |= static_cast<T>(masked_result);
489
            }
490
0
            output++;
491
0
            valid_bit -= need_bit;
492
0
        }
493
494
20
        int num = valid_bit / bit_width;             // How many outputs can be processed at a time
495
20
        int remainder = valid_bit - num * bit_width; // How many bits are left to store
496
497
        // Starting with the highest valid bit, take out bit_width bits in sequence
498
        // perform an AND operation with output_mask to ensure that only bit_width bits are valid
499
        // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end
500
        // But since there are still remainder bits that can't be processed, need to add the remainder
501
1.30k
        for (int j = 0; j < num; j++) {
502
1.28k
            *output =
503
1.28k
                    static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask);
504
1.28k
            output++;
505
1.28k
        }
506
507
20
        if (remainder) {
508
            // Process the last remaining remainder bit.
509
            // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits.
510
            // output = y << (bit_width - remainder) Use the high bit_width - remainder bit
511
0
            if constexpr (sizeof(T) <= 4) {
512
0
                auto masked_value = static_cast<T>(
513
0
                        static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1)));
514
0
                *output = static_cast<T>(masked_value << (bit_width - remainder));
515
            } else {
516
                auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1)));
517
                *output = static_cast<T>(masked_value << (bit_width - remainder));
518
            }
519
            // Already have remainder bits, next time need bit_width - remainder bits
520
0
            need_bit = bit_width - remainder;
521
20
        } else {
522
20
            need_bit = 0;
523
20
        }
524
20
    }
525
526
    // remainder
527
10
    if (tail_count) {
528
        // Put the tail_count numbers in the input into s in order, each number occupies 8 bit
529
0
        for (int i = 0; i < tail_count; i++) {
530
0
            s <<= 8;
531
0
            s |= input[full_batch_size + i];
532
0
        }
533
534
        // tail * 8 is the number of bits that are left to process
535
        // tail * 8 - more_bit is to remove the last more_bit
536
0
        valid_bit = (tail_count << 3) - more_bit;
537
0
        s >>= more_bit;
538
539
        // same as before
540
0
        if (need_bit) {
541
0
            if constexpr (sizeof(T) <= 4) {
542
0
                *output |= static_cast<T>(static_cast<uint32_t>(
543
0
                        (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1)));
544
            } else {
545
                *output |= static_cast<T>((s >> (valid_bit - need_bit)) &
546
                                          ((static_cast<U>(1) << need_bit) - 1));
547
            }
548
0
            output++;
549
0
            valid_bit -= need_bit;
550
0
        }
551
552
0
        int num = valid_bit / bit_width; // How many outputs can be processed at a time
553
554
        // same as before
555
0
        for (int j = 0; j < num; j++) {
556
0
            *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask);
557
0
            output++;
558
0
        }
559
0
    }
560
10
}
Unexecuted instantiation: _ZN5doris10ForDecoderIjE19bit_unpack_optimizeInEEvPKhhiPj
Unexecuted instantiation: _ZN5doris10ForDecoderImE19bit_unpack_optimizeIlEEvPKhhiPm
Unexecuted instantiation: _ZN5doris10ForDecoderImE19bit_unpack_optimizeInEEvPKhhiPm
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE19bit_unpack_optimizeIlEEvPKhhiPS1_
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE19bit_unpack_optimizeInEEvPKhhiPS1_
Unexecuted instantiation: _ZN5doris10ForDecoderIoE19bit_unpack_optimizeIlEEvPKhhiPo
Unexecuted instantiation: _ZN5doris10ForDecoderIoE19bit_unpack_optimizeInEEvPKhhiPo
561
562
// The reverse of bit_pack method, get original integer data list from packed bits
563
// param[in] input: the packed bits need to unpack
564
// param[in] in_num: the integer number in packed bits
565
// param[in] bit_width: how many bit we used to store each integer data
566
// param[out] output: the original integer data list
567
template <typename T>
568
162k
void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) {
569
    /*
570
        When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times.
571
        When bit_width > 64, we must use __int128_t and unroll the loop 16 times.
572
    */
573
162k
    if (bit_width <= 32) {
574
41.2k
        bit_unpack_optimize<int64_t>(input, in_num, bit_width, output);
575
121k
    } else {
576
121k
        bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output);
577
121k
    }
578
162k
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE10bit_unpackEPKhhiPa
Unexecuted instantiation: _ZN5doris10ForDecoderIsE10bit_unpackEPKhhiPs
_ZN5doris10ForDecoderIiE10bit_unpackEPKhhiPi
Line
Count
Source
568
18
void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) {
569
    /*
570
        When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times.
571
        When bit_width > 64, we must use __int128_t and unroll the loop 16 times.
572
    */
573
18
    if (bit_width <= 32) {
574
18
        bit_unpack_optimize<int64_t>(input, in_num, bit_width, output);
575
18
    } else {
576
0
        bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output);
577
0
    }
578
18
}
_ZN5doris10ForDecoderIlE10bit_unpackEPKhhiPl
Line
Count
Source
568
48.9k
void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) {
569
    /*
570
        When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times.
571
        When bit_width > 64, we must use __int128_t and unroll the loop 16 times.
572
    */
573
48.9k
    if (bit_width <= 32) {
574
24.6k
        bit_unpack_optimize<int64_t>(input, in_num, bit_width, output);
575
24.6k
    } else {
576
24.3k
        bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output);
577
24.3k
    }
578
48.9k
}
_ZN5doris10ForDecoderInE10bit_unpackEPKhhiPn
Line
Count
Source
568
113k
void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) {
569
    /*
570
        When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times.
571
        When bit_width > 64, we must use __int128_t and unroll the loop 16 times.
572
    */
573
113k
    if (bit_width <= 32) {
574
16.5k
        bit_unpack_optimize<int64_t>(input, in_num, bit_width, output);
575
97.0k
    } else {
576
97.0k
        bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output);
577
97.0k
    }
578
113k
}
Unexecuted instantiation: _ZN5doris10ForDecoderIhE10bit_unpackEPKhhiPh
Unexecuted instantiation: _ZN5doris10ForDecoderItE10bit_unpackEPKhhiPt
_ZN5doris10ForDecoderIjE10bit_unpackEPKhhiPj
Line
Count
Source
568
10
void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) {
569
    /*
570
        When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times.
571
        When bit_width > 64, we must use __int128_t and unroll the loop 16 times.
572
    */
573
10
    if (bit_width <= 32) {
574
10
        bit_unpack_optimize<int64_t>(input, in_num, bit_width, output);
575
10
    } else {
576
0
        bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output);
577
0
    }
578
10
}
Unexecuted instantiation: _ZN5doris10ForDecoderImE10bit_unpackEPKhhiPm
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE10bit_unpackEPKhhiPS1_
Unexecuted instantiation: _ZN5doris10ForDecoderIoE10bit_unpackEPKhhiPo
579
580
template <typename T>
581
8.35M
void ForDecoder<T>::decode_current_frame(T* output) {
582
8.35M
    uint32_t frame_index = _current_index / _max_frame_size;
583
8.35M
    if (frame_index == _current_decoded_frame) {
584
8.25M
        return; // current frame already decoded
585
8.25M
    }
586
97.8k
    _current_decoded_frame = frame_index;
587
97.8k
    uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index));
588
589
97.8k
    uint32_t base_offset = _frame_offsets[_current_decoded_frame];
590
97.8k
    T min = 0;
591
97.8k
    uint32_t delta_offset = 0;
592
97.8k
    if constexpr (sizeof(T) == 16) {
593
48.8k
        min = static_cast<T>(decode_fixed128_le(_buffer + base_offset));
594
48.8k
        delta_offset = base_offset + 16;
595
48.9k
    } else if constexpr (sizeof(T) == 8) {
596
48.9k
        min = static_cast<T>(decode_fixed64_le(_buffer + base_offset));
597
48.9k
        delta_offset = base_offset + 8;
598
48.9k
    } else {
599
28
        min = static_cast<T>(decode_fixed32_le(_buffer + base_offset));
600
28
        delta_offset = base_offset + 4;
601
28
    }
602
603
97.8k
    uint8_t bit_width = _bit_widths[_current_decoded_frame];
604
605
97.8k
    bool is_original_value = _storage_formats[_current_decoded_frame] == 2;
606
97.8k
    if (is_original_value) {
607
0
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output);
608
97.8k
    } else {
609
97.8k
        bool is_ascending = _storage_formats[_current_decoded_frame] == 1;
610
97.8k
        std::vector<T> delta_values(current_frame_size);
611
97.8k
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data());
612
97.8k
        if (is_ascending) {
613
902
            T pre_value = min;
614
7.74k
            for (uint8_t i = 0; i < current_frame_size; i++) {
615
6.84k
                T value = delta_values[i] + pre_value;
616
6.84k
                output[i] = value;
617
6.84k
                pre_value = value;
618
6.84k
            }
619
96.9k
        } else {
620
8.45M
            for (uint8_t i = 0; i < current_frame_size; i++) {
621
8.35M
                output[i] = delta_values[i] + min;
622
8.35M
            }
623
96.9k
        }
624
97.8k
    }
625
97.8k
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE20decode_current_frameEPa
Unexecuted instantiation: _ZN5doris10ForDecoderIsE20decode_current_frameEPs
_ZN5doris10ForDecoderIiE20decode_current_frameEPi
Line
Count
Source
581
20
void ForDecoder<T>::decode_current_frame(T* output) {
582
20
    uint32_t frame_index = _current_index / _max_frame_size;
583
20
    if (frame_index == _current_decoded_frame) {
584
2
        return; // current frame already decoded
585
2
    }
586
18
    _current_decoded_frame = frame_index;
587
18
    uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index));
588
589
18
    uint32_t base_offset = _frame_offsets[_current_decoded_frame];
590
18
    T min = 0;
591
18
    uint32_t delta_offset = 0;
592
    if constexpr (sizeof(T) == 16) {
593
        min = static_cast<T>(decode_fixed128_le(_buffer + base_offset));
594
        delta_offset = base_offset + 16;
595
    } else if constexpr (sizeof(T) == 8) {
596
        min = static_cast<T>(decode_fixed64_le(_buffer + base_offset));
597
        delta_offset = base_offset + 8;
598
18
    } else {
599
18
        min = static_cast<T>(decode_fixed32_le(_buffer + base_offset));
600
18
        delta_offset = base_offset + 4;
601
18
    }
602
603
18
    uint8_t bit_width = _bit_widths[_current_decoded_frame];
604
605
18
    bool is_original_value = _storage_formats[_current_decoded_frame] == 2;
606
18
    if (is_original_value) {
607
0
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output);
608
18
    } else {
609
18
        bool is_ascending = _storage_formats[_current_decoded_frame] == 1;
610
18
        std::vector<T> delta_values(current_frame_size);
611
18
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data());
612
18
        if (is_ascending) {
613
18
            T pre_value = min;
614
1.56k
            for (uint8_t i = 0; i < current_frame_size; i++) {
615
1.54k
                T value = delta_values[i] + pre_value;
616
1.54k
                output[i] = value;
617
1.54k
                pre_value = value;
618
1.54k
            }
619
18
        } else {
620
0
            for (uint8_t i = 0; i < current_frame_size; i++) {
621
0
                output[i] = delta_values[i] + min;
622
0
            }
623
0
        }
624
18
    }
625
18
}
_ZN5doris10ForDecoderIlE20decode_current_frameEPl
Line
Count
Source
581
4.17M
void ForDecoder<T>::decode_current_frame(T* output) {
582
4.17M
    uint32_t frame_index = _current_index / _max_frame_size;
583
4.17M
    if (frame_index == _current_decoded_frame) {
584
4.12M
        return; // current frame already decoded
585
4.12M
    }
586
48.9k
    _current_decoded_frame = frame_index;
587
48.9k
    uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index));
588
589
48.9k
    uint32_t base_offset = _frame_offsets[_current_decoded_frame];
590
48.9k
    T min = 0;
591
48.9k
    uint32_t delta_offset = 0;
592
    if constexpr (sizeof(T) == 16) {
593
        min = static_cast<T>(decode_fixed128_le(_buffer + base_offset));
594
        delta_offset = base_offset + 16;
595
48.9k
    } else if constexpr (sizeof(T) == 8) {
596
48.9k
        min = static_cast<T>(decode_fixed64_le(_buffer + base_offset));
597
48.9k
        delta_offset = base_offset + 8;
598
    } else {
599
        min = static_cast<T>(decode_fixed32_le(_buffer + base_offset));
600
        delta_offset = base_offset + 4;
601
    }
602
603
48.9k
    uint8_t bit_width = _bit_widths[_current_decoded_frame];
604
605
48.9k
    bool is_original_value = _storage_formats[_current_decoded_frame] == 2;
606
48.9k
    if (is_original_value) {
607
0
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output);
608
48.9k
    } else {
609
48.9k
        bool is_ascending = _storage_formats[_current_decoded_frame] == 1;
610
48.9k
        std::vector<T> delta_values(current_frame_size);
611
48.9k
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data());
612
48.9k
        if (is_ascending) {
613
446
            T pre_value = min;
614
3.79k
            for (uint8_t i = 0; i < current_frame_size; i++) {
615
3.34k
                T value = delta_values[i] + pre_value;
616
3.34k
                output[i] = value;
617
3.34k
                pre_value = value;
618
3.34k
            }
619
48.4k
        } else {
620
4.22M
            for (uint8_t i = 0; i < current_frame_size; i++) {
621
4.17M
                output[i] = delta_values[i] + min;
622
4.17M
            }
623
48.4k
        }
624
48.9k
    }
625
48.9k
}
_ZN5doris10ForDecoderInE20decode_current_frameEPn
Line
Count
Source
581
4.17M
void ForDecoder<T>::decode_current_frame(T* output) {
582
4.17M
    uint32_t frame_index = _current_index / _max_frame_size;
583
4.17M
    if (frame_index == _current_decoded_frame) {
584
4.12M
        return; // current frame already decoded
585
4.12M
    }
586
48.8k
    _current_decoded_frame = frame_index;
587
48.8k
    uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index));
588
589
48.8k
    uint32_t base_offset = _frame_offsets[_current_decoded_frame];
590
48.8k
    T min = 0;
591
48.8k
    uint32_t delta_offset = 0;
592
48.8k
    if constexpr (sizeof(T) == 16) {
593
48.8k
        min = static_cast<T>(decode_fixed128_le(_buffer + base_offset));
594
48.8k
        delta_offset = base_offset + 16;
595
    } else if constexpr (sizeof(T) == 8) {
596
        min = static_cast<T>(decode_fixed64_le(_buffer + base_offset));
597
        delta_offset = base_offset + 8;
598
    } else {
599
        min = static_cast<T>(decode_fixed32_le(_buffer + base_offset));
600
        delta_offset = base_offset + 4;
601
    }
602
603
48.8k
    uint8_t bit_width = _bit_widths[_current_decoded_frame];
604
605
48.8k
    bool is_original_value = _storage_formats[_current_decoded_frame] == 2;
606
48.8k
    if (is_original_value) {
607
0
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output);
608
48.8k
    } else {
609
48.8k
        bool is_ascending = _storage_formats[_current_decoded_frame] == 1;
610
48.8k
        std::vector<T> delta_values(current_frame_size);
611
48.8k
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data());
612
48.8k
        if (is_ascending) {
613
428
            T pre_value = min;
614
1.10k
            for (uint8_t i = 0; i < current_frame_size; i++) {
615
676
                T value = delta_values[i] + pre_value;
616
676
                output[i] = value;
617
676
                pre_value = value;
618
676
            }
619
48.4k
        } else {
620
4.22M
            for (uint8_t i = 0; i < current_frame_size; i++) {
621
4.17M
                output[i] = delta_values[i] + min;
622
4.17M
            }
623
48.4k
        }
624
48.8k
    }
625
48.8k
}
Unexecuted instantiation: _ZN5doris10ForDecoderIhE20decode_current_frameEPh
Unexecuted instantiation: _ZN5doris10ForDecoderItE20decode_current_frameEPt
_ZN5doris10ForDecoderIjE20decode_current_frameEPj
Line
Count
Source
581
10
void ForDecoder<T>::decode_current_frame(T* output) {
582
10
    uint32_t frame_index = _current_index / _max_frame_size;
583
10
    if (frame_index == _current_decoded_frame) {
584
0
        return; // current frame already decoded
585
0
    }
586
10
    _current_decoded_frame = frame_index;
587
10
    uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index));
588
589
10
    uint32_t base_offset = _frame_offsets[_current_decoded_frame];
590
10
    T min = 0;
591
10
    uint32_t delta_offset = 0;
592
    if constexpr (sizeof(T) == 16) {
593
        min = static_cast<T>(decode_fixed128_le(_buffer + base_offset));
594
        delta_offset = base_offset + 16;
595
    } else if constexpr (sizeof(T) == 8) {
596
        min = static_cast<T>(decode_fixed64_le(_buffer + base_offset));
597
        delta_offset = base_offset + 8;
598
10
    } else {
599
10
        min = static_cast<T>(decode_fixed32_le(_buffer + base_offset));
600
10
        delta_offset = base_offset + 4;
601
10
    }
602
603
10
    uint8_t bit_width = _bit_widths[_current_decoded_frame];
604
605
10
    bool is_original_value = _storage_formats[_current_decoded_frame] == 2;
606
10
    if (is_original_value) {
607
0
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output);
608
10
    } else {
609
10
        bool is_ascending = _storage_formats[_current_decoded_frame] == 1;
610
10
        std::vector<T> delta_values(current_frame_size);
611
10
        bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data());
612
10
        if (is_ascending) {
613
10
            T pre_value = min;
614
1.29k
            for (uint8_t i = 0; i < current_frame_size; i++) {
615
1.28k
                T value = delta_values[i] + pre_value;
616
1.28k
                output[i] = value;
617
1.28k
                pre_value = value;
618
1.28k
            }
619
10
        } else {
620
0
            for (uint8_t i = 0; i < current_frame_size; i++) {
621
0
                output[i] = delta_values[i] + min;
622
0
            }
623
0
        }
624
10
    }
625
10
}
Unexecuted instantiation: _ZN5doris10ForDecoderImE20decode_current_frameEPm
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE20decode_current_frameEPS1_
Unexecuted instantiation: _ZN5doris10ForDecoderIoE20decode_current_frameEPo
626
627
template <typename T>
628
24
T ForDecoder<T>::decode_frame_min_value(uint32_t frame_index) {
629
24
    uint32_t min_offset = _frame_offsets[frame_index];
630
24
    T min = 0;
631
24
    if constexpr (sizeof(T) == 16) {
632
0
        min = static_cast<T>(decode_fixed128_le(_buffer + min_offset));
633
24
    } else if constexpr (sizeof(T) == 8) {
634
24
        min = static_cast<T>(decode_fixed64_le(_buffer + min_offset));
635
24
    } else {
636
0
        min = static_cast<T>(decode_fixed32_le(_buffer + min_offset));
637
0
    }
638
24
    return min;
639
24
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE22decode_frame_min_valueEj
Unexecuted instantiation: _ZN5doris10ForDecoderIsE22decode_frame_min_valueEj
Unexecuted instantiation: _ZN5doris10ForDecoderIiE22decode_frame_min_valueEj
_ZN5doris10ForDecoderIlE22decode_frame_min_valueEj
Line
Count
Source
628
24
T ForDecoder<T>::decode_frame_min_value(uint32_t frame_index) {
629
24
    uint32_t min_offset = _frame_offsets[frame_index];
630
24
    T min = 0;
631
    if constexpr (sizeof(T) == 16) {
632
        min = static_cast<T>(decode_fixed128_le(_buffer + min_offset));
633
24
    } else if constexpr (sizeof(T) == 8) {
634
24
        min = static_cast<T>(decode_fixed64_le(_buffer + min_offset));
635
    } else {
636
        min = static_cast<T>(decode_fixed32_le(_buffer + min_offset));
637
    }
638
24
    return min;
639
24
}
Unexecuted instantiation: _ZN5doris10ForDecoderInE22decode_frame_min_valueEj
Unexecuted instantiation: _ZN5doris10ForDecoderIhE22decode_frame_min_valueEj
Unexecuted instantiation: _ZN5doris10ForDecoderItE22decode_frame_min_valueEj
Unexecuted instantiation: _ZN5doris10ForDecoderIjE22decode_frame_min_valueEj
Unexecuted instantiation: _ZN5doris10ForDecoderImE22decode_frame_min_valueEj
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE22decode_frame_min_valueEj
Unexecuted instantiation: _ZN5doris10ForDecoderIoE22decode_frame_min_valueEj
640
641
// Copies `count` values out of `_out_buffer` into `val` and advances the
// decoder position.
//
// The read starts at `_out_buffer[_current_index % _max_frame_size]`, i.e. at the
// position of `_current_index` relative to a frame of `_max_frame_size` values.
// `_current_index` is increased by `count`.
//
// Returns `val + count`, the address just past the last value written, so the
// caller can keep appending. No bounds check is performed in this function; the
// caller has to make sure `count` values are available from that position.
template <typename T>
642
8.35M
T* ForDecoder<T>::copy_value(T* val, size_t count) {
643
8.35M
    memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count);
644
8.35M
    _current_index += count;
645
8.35M
    val += count;
646
8.35M
    return val;
647
8.35M
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE10copy_valueEPam
Unexecuted instantiation: _ZN5doris10ForDecoderIsE10copy_valueEPsm
_ZN5doris10ForDecoderIiE10copy_valueEPim
Line
Count
Source
642
16
T* ForDecoder<T>::copy_value(T* val, size_t count) {
643
16
    memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count);
644
16
    _current_index += count;
645
16
    val += count;
646
16
    return val;
647
16
}
_ZN5doris10ForDecoderIlE10copy_valueEPlm
Line
Count
Source
642
4.17M
T* ForDecoder<T>::copy_value(T* val, size_t count) {
643
4.17M
    memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count);
644
4.17M
    _current_index += count;
645
4.17M
    val += count;
646
4.17M
    return val;
647
4.17M
}
_ZN5doris10ForDecoderInE10copy_valueEPnm
Line
Count
Source
642
4.17M
T* ForDecoder<T>::copy_value(T* val, size_t count) {
643
4.17M
    memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count);
644
4.17M
    _current_index += count;
645
4.17M
    val += count;
646
4.17M
    return val;
647
4.17M
}
Unexecuted instantiation: _ZN5doris10ForDecoderIhE10copy_valueEPhm
Unexecuted instantiation: _ZN5doris10ForDecoderItE10copy_valueEPtm
_ZN5doris10ForDecoderIjE10copy_valueEPjm
Line
Count
Source
642
6
T* ForDecoder<T>::copy_value(T* val, size_t count) {
643
6
    memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count);
644
6
    _current_index += count;
645
6
    val += count;
646
6
    return val;
647
6
}
Unexecuted instantiation: _ZN5doris10ForDecoderImE10copy_valueEPmm
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE10copy_valueEPS1_m
Unexecuted instantiation: _ZN5doris10ForDecoderIoE10copy_valueEPom
648
649
// Reads the next `count` values, starting at `_current_index`, into `val`.
//
// Returns false without touching `val` when fewer than `count` values remain
// (`_current_index + count > _values_num`); returns true otherwise.
//
// NOTE(review): `decode_current_frame` is defined outside this part of the file.
// The code below relies on it decoding the frame that contains `_current_index`
// into the given destination and keeping `_current_decoded_frame` up to date -
// confirm against its definition.
template <typename T>
650
8.35M
bool ForDecoder<T>::get_batch(T* val, size_t count) {
651
8.35M
    if (_current_index + count > _values_num) {
652
2
        return false;
653
2
    }
654
655
8.35M
    decode_current_frame(_out_buffer.data());
656
657
8.35M
    // Fast path: the request ends strictly before the end of the decoded frame,
    // so a single copy out of `_out_buffer` is enough.
    if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) {
658
8.32M
        copy_value(val, count);
659
8.32M
        return true;
660
8.32M
    }
661
662
    // 1. copy what is left of the current frame, up to the frame boundary
663
32.7k
    size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index;
664
32.7k
    val = copy_value(val, padding_num);
665
666
    // 2. handle every full frame that is completely covered by the request
667
32.7k
    size_t frame_count = (count - padding_num) / _max_frame_size;
668
32.8k
    for (size_t i = 0; i < frame_count; i++) {
669
        // decode straight into the caller's output instead of going through _out_buffer
670
14
        decode_current_frame(val);
671
14
        _current_index += _max_frame_size;
672
14
        val += _max_frame_size;
673
14
    }
674
675
    // 3. copy the leftover values (fewer than one full frame) via _out_buffer
676
32.7k
    size_t remaining_num = (count - padding_num) % _max_frame_size;
677
32.7k
    if (remaining_num > 0) {
678
8
        decode_current_frame(_out_buffer.data());
679
8
        val = copy_value(val, remaining_num);
680
8
    }
681
682
32.7k
    return true;
683
8.35M
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE9get_batchEPam
Unexecuted instantiation: _ZN5doris10ForDecoderIsE9get_batchEPsm
_ZN5doris10ForDecoderIiE9get_batchEPim
Line
Count
Source
650
16
bool ForDecoder<T>::get_batch(T* val, size_t count) {
651
16
    if (_current_index + count > _values_num) {
652
2
        return false;
653
2
    }
654
655
14
    decode_current_frame(_out_buffer.data());
656
657
14
    if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) {
658
8
        copy_value(val, count);
659
8
        return true;
660
8
    }
661
662
    // 1. padding one frame
663
6
    size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index;
664
6
    val = copy_value(val, padding_num);
665
666
    // 2. process frame by frame
667
6
    size_t frame_count = (count - padding_num) / _max_frame_size;
668
10
    for (size_t i = 0; i < frame_count; i++) {
669
        // directly decode value to the output, don't  buffer the value
670
4
        decode_current_frame(val);
671
4
        _current_index += _max_frame_size;
672
4
        val += _max_frame_size;
673
4
    }
674
675
    // 3. process remaining value
676
6
    size_t remaining_num = (count - padding_num) % _max_frame_size;
677
6
    if (remaining_num > 0) {
678
2
        decode_current_frame(_out_buffer.data());
679
2
        val = copy_value(val, remaining_num);
680
2
    }
681
682
6
    return true;
683
14
}
_ZN5doris10ForDecoderIlE9get_batchEPlm
Line
Count
Source
650
4.17M
bool ForDecoder<T>::get_batch(T* val, size_t count) {
651
4.17M
    if (_current_index + count > _values_num) {
652
0
        return false;
653
0
    }
654
655
4.17M
    decode_current_frame(_out_buffer.data());
656
657
4.17M
    if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) {
658
4.16M
        copy_value(val, count);
659
4.16M
        return true;
660
4.16M
    }
661
662
    // 1. padding one frame
663
16.3k
    size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index;
664
16.3k
    val = copy_value(val, padding_num);
665
666
    // 2. process frame by frame
667
16.3k
    size_t frame_count = (count - padding_num) / _max_frame_size;
668
16.3k
    for (size_t i = 0; i < frame_count; i++) {
669
        // directly decode value to the output, don't  buffer the value
670
6
        decode_current_frame(val);
671
6
        _current_index += _max_frame_size;
672
6
        val += _max_frame_size;
673
6
    }
674
675
    // 3. process remaining value
676
16.3k
    size_t remaining_num = (count - padding_num) % _max_frame_size;
677
16.3k
    if (remaining_num > 0) {
678
6
        decode_current_frame(_out_buffer.data());
679
6
        val = copy_value(val, remaining_num);
680
6
    }
681
682
16.3k
    return true;
683
4.17M
}
_ZN5doris10ForDecoderInE9get_batchEPnm
Line
Count
Source
650
4.17M
bool ForDecoder<T>::get_batch(T* val, size_t count) {
651
4.17M
    if (_current_index + count > _values_num) {
652
0
        return false;
653
0
    }
654
655
4.17M
    decode_current_frame(_out_buffer.data());
656
657
4.17M
    if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) {
658
4.16M
        copy_value(val, count);
659
4.16M
        return true;
660
4.16M
    }
661
662
    // 1. padding one frame
663
16.3k
    size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index;
664
16.3k
    val = copy_value(val, padding_num);
665
666
    // 2. process frame by frame
667
16.3k
    size_t frame_count = (count - padding_num) / _max_frame_size;
668
16.3k
    for (size_t i = 0; i < frame_count; i++) {
669
        // directly decode value to the output, don't  buffer the value
670
0
        decode_current_frame(val);
671
0
        _current_index += _max_frame_size;
672
0
        val += _max_frame_size;
673
0
    }
674
675
    // 3. process remaining value
676
16.3k
    size_t remaining_num = (count - padding_num) % _max_frame_size;
677
16.3k
    if (remaining_num > 0) {
678
0
        decode_current_frame(_out_buffer.data());
679
0
        val = copy_value(val, remaining_num);
680
0
    }
681
682
16.3k
    return true;
683
4.17M
}
Unexecuted instantiation: _ZN5doris10ForDecoderIhE9get_batchEPhm
Unexecuted instantiation: _ZN5doris10ForDecoderItE9get_batchEPtm
_ZN5doris10ForDecoderIjE9get_batchEPjm
Line
Count
Source
650
6
bool ForDecoder<T>::get_batch(T* val, size_t count) {
651
6
    if (_current_index + count > _values_num) {
652
0
        return false;
653
0
    }
654
655
6
    decode_current_frame(_out_buffer.data());
656
657
6
    if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) {
658
0
        copy_value(val, count);
659
0
        return true;
660
0
    }
661
662
    // 1. padding one frame
663
6
    size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index;
664
6
    val = copy_value(val, padding_num);
665
666
    // 2. process frame by frame
667
6
    size_t frame_count = (count - padding_num) / _max_frame_size;
668
10
    for (size_t i = 0; i < frame_count; i++) {
669
        // directly decode value to the output, don't  buffer the value
670
4
        decode_current_frame(val);
671
4
        _current_index += _max_frame_size;
672
4
        val += _max_frame_size;
673
4
    }
674
675
    // 3. process remaining value
676
6
    size_t remaining_num = (count - padding_num) % _max_frame_size;
677
6
    if (remaining_num > 0) {
678
0
        decode_current_frame(_out_buffer.data());
679
0
        val = copy_value(val, remaining_num);
680
0
    }
681
682
6
    return true;
683
6
}
Unexecuted instantiation: _ZN5doris10ForDecoderImE9get_batchEPmm
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE9get_batchEPS1_m
Unexecuted instantiation: _ZN5doris10ForDecoderIoE9get_batchEPom
684
685
// Moves the read position forward by `skip_num` values.
//
// Returns false and leaves `_current_index` unchanged when the new position
// would be greater than or equal to `_values_num`; otherwise stores the new
// position and returns true. Nothing is decoded here.
//
// NOTE(review): `skip_num` is signed and is not checked for negative values in
// this function, and a skip that lands exactly on `_values_num` is rejected
// (`>=`). Confirm both are intended by the callers.
template <typename T>
686
6
bool ForDecoder<T>::skip(int32_t skip_num) {
687
6
    if (_current_index + skip_num >= _values_num) {
688
0
        return false;
689
0
    }
690
6
    _current_index = _current_index + skip_num;
691
6
    return true;
692
6
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE4skipEi
Unexecuted instantiation: _ZN5doris10ForDecoderIsE4skipEi
Unexecuted instantiation: _ZN5doris10ForDecoderIiE4skipEi
Unexecuted instantiation: _ZN5doris10ForDecoderIlE4skipEi
Unexecuted instantiation: _ZN5doris10ForDecoderInE4skipEi
Unexecuted instantiation: _ZN5doris10ForDecoderIhE4skipEi
Unexecuted instantiation: _ZN5doris10ForDecoderItE4skipEi
_ZN5doris10ForDecoderIjE4skipEi
Line
Count
Source
686
6
bool ForDecoder<T>::skip(int32_t skip_num) {
687
6
    if (_current_index + skip_num >= _values_num) {
688
0
        return false;
689
0
    }
690
6
    _current_index = _current_index + skip_num;
691
6
    return true;
692
6
}
Unexecuted instantiation: _ZN5doris10ForDecoderImE4skipEi
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE4skipEi
Unexecuted instantiation: _ZN5doris10ForDecoderIoE4skipEi
693
694
// Binary-searches the frames by the value returned from
// `decode_frame_min_value` and returns the index of the last frame whose
// value is strictly less than `target`.
//
// Returns `_frame_count` as a "no such frame" marker when the search ends at
// index 0, i.e. when no frame compares less than `target` (this also covers
// `_frame_count == 0`, because the loop body never runs).
//
// NOTE(review): the binary search is only meaningful if the per-frame values
// are non-decreasing with the frame index - presumably the data is sorted;
// confirm with the callers.
template <typename T>
695
12
uint32_t ForDecoder<T>::seek_last_frame_before_value(T target) {
696
    // first of all, find the first frame >= target
697
12
    uint32_t left = 0;
698
12
    uint32_t right = _frame_count;
699
36
    while (left < right) {
700
24
        // written as left + (right - left) / 2 rather than (left + right) / 2
        uint32_t mid = left + (right - left) / 2;
701
24
        T midValue = decode_frame_min_value(mid);
702
24
        if (midValue < target) {
703
12
            left = mid + 1;
704
12
        } else {
705
12
            right = mid;
706
12
        }
707
24
    }
708
    // after loop, left is the first frame >= target
709
12
    if (left == 0) {
710
        // all frames are >= target, not found
711
4
        return _frame_count;
712
4
    }
713
    // otherwise previous frame is the last frame < target
714
8
    return left - 1;
715
12
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE28seek_last_frame_before_valueEa
Unexecuted instantiation: _ZN5doris10ForDecoderIsE28seek_last_frame_before_valueEs
Unexecuted instantiation: _ZN5doris10ForDecoderIiE28seek_last_frame_before_valueEi
_ZN5doris10ForDecoderIlE28seek_last_frame_before_valueEl
Line
Count
Source
695
12
uint32_t ForDecoder<T>::seek_last_frame_before_value(T target) {
696
    // first of all, find the first frame >= target
697
12
    uint32_t left = 0;
698
12
    uint32_t right = _frame_count;
699
36
    while (left < right) {
700
24
        uint32_t mid = left + (right - left) / 2;
701
24
        T midValue = decode_frame_min_value(mid);
702
24
        if (midValue < target) {
703
12
            left = mid + 1;
704
12
        } else {
705
12
            right = mid;
706
12
        }
707
24
    }
708
    // after loop, left is the first frame >= target
709
12
    if (left == 0) {
710
        // all frames are >= target, not found
711
4
        return _frame_count;
712
4
    }
713
    // otherwise previous frame is the last frame < target
714
8
    return left - 1;
715
12
}
Unexecuted instantiation: _ZN5doris10ForDecoderInE28seek_last_frame_before_valueEn
Unexecuted instantiation: _ZN5doris10ForDecoderIhE28seek_last_frame_before_valueEh
Unexecuted instantiation: _ZN5doris10ForDecoderItE28seek_last_frame_before_valueEt
Unexecuted instantiation: _ZN5doris10ForDecoderIjE28seek_last_frame_before_valueEj
Unexecuted instantiation: _ZN5doris10ForDecoderImE28seek_last_frame_before_valueEm
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE28seek_last_frame_before_valueES1_
Unexecuted instantiation: _ZN5doris10ForDecoderIoE28seek_last_frame_before_valueEo
716
717
// Positions the decoder on the first value inside frame `frame_index` that is
// not less than `target`.
//
// The function first sets `_current_index` to the start of the frame
// (`frame_index * _max_frame_size`), decodes into `_out_buffer`, and runs
// `std::lower_bound` over the first `frame_size(frame_index)` entries.
//
// On a hit it adds the position inside the frame to `_current_index`, writes
// `*exact_match` (true when the value found equals `target`) and returns true.
// On a miss it returns false; `_current_index` then still points at the start
// of the frame and `*exact_match` is not written.
//
// NOTE(review): `std::lower_bound` requires the searched range to be ordered
// with respect to `target` - presumably the values in a frame are sorted; confirm.
template <typename T>
718
bool ForDecoder<T>::seek_lower_bound_inside_frame(uint32_t frame_index, T target,
719
8
                                                  bool* exact_match) {
720
8
    _current_index = frame_index * _max_frame_size;
721
8
    decode_current_frame(_out_buffer.data());
722
8
    // Only the first frame_size(frame_index) entries of _out_buffer are searched.
    auto end = _out_buffer.begin() + frame_size(frame_index);
723
8
    auto pos = std::lower_bound(_out_buffer.begin(), end, target);
724
8
    if (pos != end) { // found in this frame
725
4
        auto pos_in_frame = cast_set<uint32_t>(std::distance(_out_buffer.begin(), pos));
726
4
        *exact_match = _out_buffer[pos_in_frame] == target;
727
4
        _current_index += pos_in_frame;
728
4
        return true;
729
4
    }
730
4
    return false;
731
8
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE29seek_lower_bound_inside_frameEjaPb
Unexecuted instantiation: _ZN5doris10ForDecoderIsE29seek_lower_bound_inside_frameEjsPb
Unexecuted instantiation: _ZN5doris10ForDecoderIiE29seek_lower_bound_inside_frameEjiPb
_ZN5doris10ForDecoderIlE29seek_lower_bound_inside_frameEjlPb
Line
Count
Source
719
8
                                                  bool* exact_match) {
720
8
    _current_index = frame_index * _max_frame_size;
721
8
    decode_current_frame(_out_buffer.data());
722
8
    auto end = _out_buffer.begin() + frame_size(frame_index);
723
8
    auto pos = std::lower_bound(_out_buffer.begin(), end, target);
724
8
    if (pos != end) { // found in this frame
725
4
        auto pos_in_frame = cast_set<uint32_t>(std::distance(_out_buffer.begin(), pos));
726
4
        *exact_match = _out_buffer[pos_in_frame] == target;
727
4
        _current_index += pos_in_frame;
728
4
        return true;
729
4
    }
730
4
    return false;
731
8
}
Unexecuted instantiation: _ZN5doris10ForDecoderInE29seek_lower_bound_inside_frameEjnPb
Unexecuted instantiation: _ZN5doris10ForDecoderIhE29seek_lower_bound_inside_frameEjhPb
Unexecuted instantiation: _ZN5doris10ForDecoderItE29seek_lower_bound_inside_frameEjtPb
Unexecuted instantiation: _ZN5doris10ForDecoderIjE29seek_lower_bound_inside_frameEjjPb
Unexecuted instantiation: _ZN5doris10ForDecoderImE29seek_lower_bound_inside_frameEjmPb
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE29seek_lower_bound_inside_frameEjS1_Pb
Unexecuted instantiation: _ZN5doris10ForDecoderIoE29seek_lower_bound_inside_frameEjoPb
732
733
// Moves the decoder to the first value that is greater than or equal to the
// value pointed to by `value`, which is read as a `T`.
//
// Returns true when such a position was selected; `*exact_match` then tells
// whether the value at that position equals the target. Returns false when the
// in-frame search fails and there is no following frame; in that case this
// function does not write `*exact_match`.
//
// NOTE(review): when `_frame_count` is 0 the helper returns `_frame_count`, so
// the first branch runs and reads `_out_buffer[0]` - confirm that an empty
// decoder never reaches this function.
template <typename T>
734
12
bool ForDecoder<T>::seek_at_or_after_value(const void* value, bool* exact_match) {
735
12
    T target = *reinterpret_cast<const T*>(value);
736
12
    uint32_t frame_to_search = seek_last_frame_before_value(target);
737
12
    if (frame_to_search == _frame_count) {
738
        // all frames are >= target, so the searched value must be the very first value
739
4
        _current_index = 0;
740
4
        decode_current_frame(_out_buffer.data());
741
4
        *exact_match = _out_buffer[0] == target;
742
4
        return true;
743
4
    }
744
    // binary search inside the last frame < target
745
8
    bool found = seek_lower_bound_inside_frame(frame_to_search, target, exact_match);
746
    // if not found, all values in the last frame are less than target.
747
    // then the searched value must be the first value of the next frame.
748
8
    if (!found && frame_to_search < _frame_count - 1) {
749
2
        _current_index = (frame_to_search + 1) * _max_frame_size;
750
2
        decode_current_frame(_out_buffer.data());
751
2
        *exact_match = _out_buffer[0] == target;
752
2
        return true;
753
2
    }
754
6
    return found;
755
8
}
Unexecuted instantiation: _ZN5doris10ForDecoderIaE22seek_at_or_after_valueEPKvPb
Unexecuted instantiation: _ZN5doris10ForDecoderIsE22seek_at_or_after_valueEPKvPb
Unexecuted instantiation: _ZN5doris10ForDecoderIiE22seek_at_or_after_valueEPKvPb
_ZN5doris10ForDecoderIlE22seek_at_or_after_valueEPKvPb
Line
Count
Source
734
12
bool ForDecoder<T>::seek_at_or_after_value(const void* value, bool* exact_match) {
735
12
    T target = *reinterpret_cast<const T*>(value);
736
12
    uint32_t frame_to_search = seek_last_frame_before_value(target);
737
12
    if (frame_to_search == _frame_count) {
738
        // all frames are >= target, the searched value must the be first value
739
4
        _current_index = 0;
740
4
        decode_current_frame(_out_buffer.data());
741
4
        *exact_match = _out_buffer[0] == target;
742
4
        return true;
743
4
    }
744
    // binary search inside the last frame < target
745
8
    bool found = seek_lower_bound_inside_frame(frame_to_search, target, exact_match);
746
    // if not found, all values in the last frame are less than target.
747
    // then the searched value must be the first value of the next frame.
748
8
    if (!found && frame_to_search < _frame_count - 1) {
749
2
        _current_index = (frame_to_search + 1) * _max_frame_size;
750
2
        decode_current_frame(_out_buffer.data());
751
2
        *exact_match = _out_buffer[0] == target;
752
2
        return true;
753
2
    }
754
6
    return found;
755
8
}
Unexecuted instantiation: _ZN5doris10ForDecoderInE22seek_at_or_after_valueEPKvPb
Unexecuted instantiation: _ZN5doris10ForDecoderIhE22seek_at_or_after_valueEPKvPb
Unexecuted instantiation: _ZN5doris10ForDecoderItE22seek_at_or_after_valueEPKvPb
Unexecuted instantiation: _ZN5doris10ForDecoderIjE22seek_at_or_after_valueEPKvPb
Unexecuted instantiation: _ZN5doris10ForDecoderImE22seek_at_or_after_valueEPKvPb
Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE22seek_at_or_after_valueEPKvPb
Unexecuted instantiation: _ZN5doris10ForDecoderIoE22seek_at_or_after_valueEPKvPb
756
757
// Explicit instantiations of ForEncoder for the signed and unsigned 8/16/32/64/128-bit
// integer types plus uint24_t, so the member templates defined in this file are
// emitted for each of these types.
template class ForEncoder<int8_t>;
758
template class ForEncoder<int16_t>;
759
template class ForEncoder<int32_t>;
760
template class ForEncoder<int64_t>;
761
template class ForEncoder<int128_t>;
762
template class ForEncoder<uint8_t>;
763
template class ForEncoder<uint16_t>;
764
template class ForEncoder<uint32_t>;
765
template class ForEncoder<uint64_t>;
766
template class ForEncoder<uint24_t>;
767
template class ForEncoder<uint128_t>;
768
769
// The same set of value types for ForDecoder.
template class ForDecoder<int8_t>;
770
template class ForDecoder<int16_t>;
771
template class ForDecoder<int32_t>;
772
template class ForDecoder<int64_t>;
773
template class ForDecoder<int128_t>;
774
template class ForDecoder<uint8_t>;
775
template class ForDecoder<uint16_t>;
776
template class ForDecoder<uint32_t>;
777
template class ForDecoder<uint64_t>;
778
template class ForDecoder<uint24_t>;
779
template class ForDecoder<uint128_t>;
780
#include "common/compile_check_end.h"
781
} // namespace doris