be/src/util/frame_of_reference_coding.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "util/frame_of_reference_coding.h" |
19 | | |
20 | | #include <glog/logging.h> |
21 | | #include <sys/types.h> |
22 | | |
23 | | #include <algorithm> |
24 | | #include <cstring> |
25 | | #include <iostream> |
26 | | #include <iterator> |
27 | | #include <limits> |
28 | | |
29 | | #include "common/cast_set.h" |
30 | | #include "exec/common/endian.h" |
31 | | #include "util/bit_util.h" |
32 | | #include "util/coding.h" |
33 | | |
34 | | namespace doris { |
35 | | #include "common/compile_check_begin.h" |
36 | | |
37 | | template <typename T> |
38 | 4.17M | const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) { |
39 | 4.17M | memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T)); |
40 | 4.17M | _buffered_values_num += count; |
41 | 4.17M | p_data += count; |
42 | 4.17M | return p_data; |
43 | 4.17M | } Unexecuted instantiation: _ZN5doris10ForEncoderIaE10copy_valueEPKam Unexecuted instantiation: _ZN5doris10ForEncoderIsE10copy_valueEPKsm _ZN5doris10ForEncoderIiE10copy_valueEPKim Line | Count | Source | 38 | 8 | const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) { | 39 | 8 | memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T)); | 40 | 8 | _buffered_values_num += count; | 41 | 8 | p_data += count; | 42 | 8 | return p_data; | 43 | 8 | } |
_ZN5doris10ForEncoderIlE10copy_valueEPKlm Line | Count | Source | 38 | 2.08M | const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) { | 39 | 2.08M | memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T)); | 40 | 2.08M | _buffered_values_num += count; | 41 | 2.08M | p_data += count; | 42 | 2.08M | return p_data; | 43 | 2.08M | } |
_ZN5doris10ForEncoderInE10copy_valueEPKnm Line | Count | Source | 38 | 2.08M | const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) { | 39 | 2.08M | memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T)); | 40 | 2.08M | _buffered_values_num += count; | 41 | 2.08M | p_data += count; | 42 | 2.08M | return p_data; | 43 | 2.08M | } |
Unexecuted instantiation: _ZN5doris10ForEncoderIhE10copy_valueEPKhm Unexecuted instantiation: _ZN5doris10ForEncoderItE10copy_valueEPKtm _ZN5doris10ForEncoderIjE10copy_valueEPKjm Line | Count | Source | 38 | 3 | const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) { | 39 | 3 | memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T)); | 40 | 3 | _buffered_values_num += count; | 41 | 3 | p_data += count; | 42 | 3 | return p_data; | 43 | 3 | } |
Unexecuted instantiation: _ZN5doris10ForEncoderImE10copy_valueEPKmm Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10copy_valueEPKS1_m Unexecuted instantiation: _ZN5doris10ForEncoderIoE10copy_valueEPKom |
44 | | |
45 | | template <typename T> |
46 | 4.17M | void ForEncoder<T>::put_batch(const T* in_data, size_t count) { |
47 | 4.17M | if (_buffered_values_num + count < FRAME_VALUE_NUM) { |
48 | 4.16M | copy_value(in_data, count); |
49 | 4.16M | _values_num += count; |
50 | 4.16M | return; |
51 | 4.16M | } |
52 | | |
53 | | // 1. padding one frame |
54 | 16.3k | size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num; |
55 | 16.3k | in_data = copy_value(in_data, padding_num); |
56 | 16.3k | bit_packing_one_frame_value(_buffered_values); |
57 | | |
58 | | // 2. process frame by frame |
59 | 16.3k | size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM; |
60 | 16.4k | for (size_t i = 0; i < frame_size; i++) { |
61 | | // directly encode value to the bit_writer, don't buffer the value |
62 | 8 | _buffered_values_num = FRAME_VALUE_NUM; |
63 | 8 | bit_packing_one_frame_value(in_data); |
64 | 8 | in_data += FRAME_VALUE_NUM; |
65 | 8 | } |
66 | | |
67 | | // 3. process remaining value |
68 | 16.3k | size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM; |
69 | 16.3k | if (remaining_num > 0) { |
70 | 4 | copy_value(in_data, remaining_num); |
71 | 4 | } |
72 | | |
73 | 16.3k | _values_num += count; |
74 | 16.3k | } Unexecuted instantiation: _ZN5doris10ForEncoderIaE9put_batchEPKam Unexecuted instantiation: _ZN5doris10ForEncoderIsE9put_batchEPKsm _ZN5doris10ForEncoderIiE9put_batchEPKim Line | Count | Source | 46 | 7 | void ForEncoder<T>::put_batch(const T* in_data, size_t count) { | 47 | 7 | if (_buffered_values_num + count < FRAME_VALUE_NUM) { | 48 | 4 | copy_value(in_data, count); | 49 | 4 | _values_num += count; | 50 | 4 | return; | 51 | 4 | } | 52 | | | 53 | | // 1. padding one frame | 54 | 3 | size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num; | 55 | 3 | in_data = copy_value(in_data, padding_num); | 56 | 3 | bit_packing_one_frame_value(_buffered_values); | 57 | | | 58 | | // 2. process frame by frame | 59 | 3 | size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM; | 60 | 5 | for (size_t i = 0; i < frame_size; i++) { | 61 | | // directly encode value to the bit_writer, don't buffer the value | 62 | 2 | _buffered_values_num = FRAME_VALUE_NUM; | 63 | 2 | bit_packing_one_frame_value(in_data); | 64 | 2 | in_data += FRAME_VALUE_NUM; | 65 | 2 | } | 66 | | | 67 | | // 3. process remaining value | 68 | 3 | size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM; | 69 | 3 | if (remaining_num > 0) { | 70 | 1 | copy_value(in_data, remaining_num); | 71 | 1 | } | 72 | | | 73 | 3 | _values_num += count; | 74 | 3 | } |
_ZN5doris10ForEncoderIlE9put_batchEPKlm Line | Count | Source | 46 | 2.08M | void ForEncoder<T>::put_batch(const T* in_data, size_t count) { | 47 | 2.08M | if (_buffered_values_num + count < FRAME_VALUE_NUM) { | 48 | 2.08M | copy_value(in_data, count); | 49 | 2.08M | _values_num += count; | 50 | 2.08M | return; | 51 | 2.08M | } | 52 | | | 53 | | // 1. padding one frame | 54 | 8.19k | size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num; | 55 | 8.19k | in_data = copy_value(in_data, padding_num); | 56 | 8.19k | bit_packing_one_frame_value(_buffered_values); | 57 | | | 58 | | // 2. process frame by frame | 59 | 8.19k | size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM; | 60 | 8.19k | for (size_t i = 0; i < frame_size; i++) { | 61 | | // directly encode value to the bit_writer, don't buffer the value | 62 | 3 | _buffered_values_num = FRAME_VALUE_NUM; | 63 | 3 | bit_packing_one_frame_value(in_data); | 64 | 3 | in_data += FRAME_VALUE_NUM; | 65 | 3 | } | 66 | | | 67 | | // 3. process remaining value | 68 | 8.19k | size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM; | 69 | 8.19k | if (remaining_num > 0) { | 70 | 3 | copy_value(in_data, remaining_num); | 71 | 3 | } | 72 | | | 73 | 8.19k | _values_num += count; | 74 | 8.19k | } |
_ZN5doris10ForEncoderInE9put_batchEPKnm Line | Count | Source | 46 | 2.08M | void ForEncoder<T>::put_batch(const T* in_data, size_t count) { | 47 | 2.08M | if (_buffered_values_num + count < FRAME_VALUE_NUM) { | 48 | 2.08M | copy_value(in_data, count); | 49 | 2.08M | _values_num += count; | 50 | 2.08M | return; | 51 | 2.08M | } | 52 | | | 53 | | // 1. padding one frame | 54 | 8.19k | size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num; | 55 | 8.19k | in_data = copy_value(in_data, padding_num); | 56 | 8.19k | bit_packing_one_frame_value(_buffered_values); | 57 | | | 58 | | // 2. process frame by frame | 59 | 8.19k | size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM; | 60 | 8.19k | for (size_t i = 0; i < frame_size; i++) { | 61 | | // directly encode value to the bit_writer, don't buffer the value | 62 | 0 | _buffered_values_num = FRAME_VALUE_NUM; | 63 | 0 | bit_packing_one_frame_value(in_data); | 64 | 0 | in_data += FRAME_VALUE_NUM; | 65 | 0 | } | 66 | | | 67 | | // 3. process remaining value | 68 | 8.19k | size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM; | 69 | 8.19k | if (remaining_num > 0) { | 70 | 0 | copy_value(in_data, remaining_num); | 71 | 0 | } | 72 | | | 73 | 8.19k | _values_num += count; | 74 | 8.19k | } |
Unexecuted instantiation: _ZN5doris10ForEncoderIhE9put_batchEPKhm Unexecuted instantiation: _ZN5doris10ForEncoderItE9put_batchEPKtm _ZN5doris10ForEncoderIjE9put_batchEPKjm Line | Count | Source | 46 | 3 | void ForEncoder<T>::put_batch(const T* in_data, size_t count) { | 47 | 3 | if (_buffered_values_num + count < FRAME_VALUE_NUM) { | 48 | 0 | copy_value(in_data, count); | 49 | 0 | _values_num += count; | 50 | 0 | return; | 51 | 0 | } | 52 | | | 53 | | // 1. padding one frame | 54 | 3 | size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num; | 55 | 3 | in_data = copy_value(in_data, padding_num); | 56 | 3 | bit_packing_one_frame_value(_buffered_values); | 57 | | | 58 | | // 2. process frame by frame | 59 | 3 | size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM; | 60 | 6 | for (size_t i = 0; i < frame_size; i++) { | 61 | | // directly encode value to the bit_writer, don't buffer the value | 62 | 3 | _buffered_values_num = FRAME_VALUE_NUM; | 63 | 3 | bit_packing_one_frame_value(in_data); | 64 | 3 | in_data += FRAME_VALUE_NUM; | 65 | 3 | } | 66 | | | 67 | | // 3. process remaining value | 68 | 3 | size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM; | 69 | 3 | if (remaining_num > 0) { | 70 | 0 | copy_value(in_data, remaining_num); | 71 | 0 | } | 72 | | | 73 | 3 | _values_num += count; | 74 | 3 | } |
Unexecuted instantiation: _ZN5doris10ForEncoderImE9put_batchEPKmm Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE9put_batchEPKS1_m Unexecuted instantiation: _ZN5doris10ForEncoderIoE9put_batchEPKom |
75 | | |
76 | | // todo(kks): improve this method by SIMD instructions |
77 | | |
78 | | template <typename T> |
79 | 15.3k | void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { |
80 | 15.3k | int64_t s = 0; |
81 | 15.3k | uint8_t output_mask = 255; |
82 | 15.3k | int tail_count = in_num & 7; // the remainder of in_num modulo 8 |
83 | 15.3k | int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8 |
84 | | |
85 | 237k | for (int i = 0; i < full_batch_size; i += 8) { |
86 | | // Put the 8 numbers in the input into s in order, each number occupies bit_width bit |
87 | 222k | s |= static_cast<int64_t>(input[i + 7]); |
88 | 222k | s |= (static_cast<int64_t>(input[i + 6])) << bit_width; |
89 | 222k | s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width); |
90 | 222k | s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width); |
91 | 222k | s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width); |
92 | 222k | s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width); |
93 | 222k | s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width); |
94 | 222k | s |= (static_cast<int64_t>(input[i])) << (7 * bit_width); |
95 | | |
96 | | // Starting with the highest valid bit, take out 8 bits in sequence |
97 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid |
98 | | // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end |
99 | 1.22M | for (int j = 0; j < bit_width; j++) { |
100 | 1.00M | output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask; |
101 | 1.00M | } |
102 | 222k | output += bit_width; |
103 | 222k | s = 0; |
104 | 222k | } |
105 | | |
106 | | // remainder |
107 | 15.3k | int byte = tail_count * bit_width; // How many bits are left to store |
108 | 15.3k | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input |
109 | | |
110 | | // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit |
111 | 65.4k | for (int i = 0; i < tail_count; i++) { |
112 | 50.1k | s |= (static_cast<int64_t>(input[i + full_batch_size])) |
113 | 50.1k | << ((tail_count - i - 1) * bit_width); |
114 | 50.1k | } |
115 | | |
116 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end |
117 | 15.3k | s <<= (bytes << 3) - byte; |
118 | | |
119 | | // Starting with the highest valid bit, take out 8 bits in sequence |
120 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid. |
121 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end |
122 | 48.4k | for (int i = 0; i < bytes; i++) { |
123 | 33.1k | output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask; |
124 | 33.1k | } |
125 | 15.3k | } Unexecuted instantiation: _ZN5doris10ForEncoderIaE10bit_pack_8EPKahiPh Unexecuted instantiation: _ZN5doris10ForEncoderIsE10bit_pack_8EPKshiPh _ZN5doris10ForEncoderIiE10bit_pack_8EPKihiPh Line | Count | Source | 79 | 8 | void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 80 | 8 | int64_t s = 0; | 81 | 8 | uint8_t output_mask = 255; | 82 | 8 | int tail_count = in_num & 7; // the remainder of in_num modulo 8 | 83 | 8 | int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8 | 84 | | | 85 | 104 | for (int i = 0; i < full_batch_size; i += 8) { | 86 | | // Put the 8 numbers in the input into s in order, each number occupies bit_width bit | 87 | 96 | s |= static_cast<int64_t>(input[i + 7]); | 88 | 96 | s |= (static_cast<int64_t>(input[i + 6])) << bit_width; | 89 | 96 | s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width); | 90 | 96 | s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width); | 91 | 96 | s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width); | 92 | 96 | s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width); | 93 | 96 | s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width); | 94 | 96 | s |= (static_cast<int64_t>(input[i])) << (7 * bit_width); | 95 | | | 96 | | // Starting with the highest valid bit, take out 8 bits in sequence | 97 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 98 | | // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end | 99 | 192 | for (int j = 0; j < bit_width; j++) { | 100 | 96 | output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask; | 101 | 96 | } | 102 | 96 | output += bit_width; | 103 | 96 | s = 0; | 104 | 96 | } | 105 | | | 106 | | // remainder | 107 | 8 | int byte = tail_count * bit_width; // How many bits are left to store | 108 | 8 | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input | 109 | | 
| 110 | | // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit | 111 | 10 | for (int i = 0; i < tail_count; i++) { | 112 | 2 | s |= (static_cast<int64_t>(input[i + full_batch_size])) | 113 | 2 | << ((tail_count - i - 1) * bit_width); | 114 | 2 | } | 115 | | | 116 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end | 117 | 8 | s <<= (bytes << 3) - byte; | 118 | | | 119 | | // Starting with the highest valid bit, take out 8 bits in sequence | 120 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid. | 121 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end | 122 | 9 | for (int i = 0; i < bytes; i++) { | 123 | 1 | output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask; | 124 | 1 | } | 125 | 8 | } |
_ZN5doris10ForEncoderIlE10bit_pack_8EPKlhiPh Line | Count | Source | 79 | 3.05k | void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 80 | 3.05k | int64_t s = 0; | 81 | 3.05k | uint8_t output_mask = 255; | 82 | 3.05k | int tail_count = in_num & 7; // the remainder of in_num modulo 8 | 83 | 3.05k | int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8 | 84 | | | 85 | 34.9k | for (int i = 0; i < full_batch_size; i += 8) { | 86 | | // Put the 8 numbers in the input into s in order, each number occupies bit_width bit | 87 | 31.8k | s |= static_cast<int64_t>(input[i + 7]); | 88 | 31.8k | s |= (static_cast<int64_t>(input[i + 6])) << bit_width; | 89 | 31.8k | s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width); | 90 | 31.8k | s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width); | 91 | 31.8k | s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width); | 92 | 31.8k | s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width); | 93 | 31.8k | s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width); | 94 | 31.8k | s |= (static_cast<int64_t>(input[i])) << (7 * bit_width); | 95 | | | 96 | | // Starting with the highest valid bit, take out 8 bits in sequence | 97 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 98 | | // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end | 99 | 174k | for (int j = 0; j < bit_width; j++) { | 100 | 142k | output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask; | 101 | 142k | } | 102 | 31.8k | output += bit_width; | 103 | 31.8k | s = 0; | 104 | 31.8k | } | 105 | | | 106 | | // remainder | 107 | 3.05k | int byte = tail_count * bit_width; // How many bits are left to store | 108 | 3.05k | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input | 109 | | | 110 | | // Put the tail_count numbers in the input into s in order, each number occupies 
bit_width bit | 111 | 10.2k | for (int i = 0; i < tail_count; i++) { | 112 | 7.16k | s |= (static_cast<int64_t>(input[i + full_batch_size])) | 113 | 7.16k | << ((tail_count - i - 1) * bit_width); | 114 | 7.16k | } | 115 | | | 116 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end | 117 | 3.05k | s <<= (bytes << 3) - byte; | 118 | | | 119 | | // Starting with the highest valid bit, take out 8 bits in sequence | 120 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid. | 121 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end | 122 | 7.77k | for (int i = 0; i < bytes; i++) { | 123 | 4.72k | output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask; | 124 | 4.72k | } | 125 | 3.05k | } |
_ZN5doris10ForEncoderInE10bit_pack_8EPKnhiPh Line | Count | Source | 79 | 12.2k | void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 80 | 12.2k | int64_t s = 0; | 81 | 12.2k | uint8_t output_mask = 255; | 82 | 12.2k | int tail_count = in_num & 7; // the remainder of in_num modulo 8 | 83 | 12.2k | int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8 | 84 | | | 85 | 202k | for (int i = 0; i < full_batch_size; i += 8) { | 86 | | // Put the 8 numbers in the input into s in order, each number occupies bit_width bit | 87 | 190k | s |= static_cast<int64_t>(input[i + 7]); | 88 | 190k | s |= (static_cast<int64_t>(input[i + 6])) << bit_width; | 89 | 190k | s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width); | 90 | 190k | s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width); | 91 | 190k | s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width); | 92 | 190k | s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width); | 93 | 190k | s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width); | 94 | 190k | s |= (static_cast<int64_t>(input[i])) << (7 * bit_width); | 95 | | | 96 | | // Starting with the highest valid bit, take out 8 bits in sequence | 97 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 98 | | // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end | 99 | 1.04M | for (int j = 0; j < bit_width; j++) { | 100 | 857k | output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask; | 101 | 857k | } | 102 | 190k | output += bit_width; | 103 | 190k | s = 0; | 104 | 190k | } | 105 | | | 106 | | // remainder | 107 | 12.2k | int byte = tail_count * bit_width; // How many bits are left to store | 108 | 12.2k | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input | 109 | | | 110 | | // Put the tail_count numbers in the input into s in order, each number occupies bit_width 
bit | 111 | 55.2k | for (int i = 0; i < tail_count; i++) { | 112 | 43.0k | s |= (static_cast<int64_t>(input[i + full_batch_size])) | 113 | 43.0k | << ((tail_count - i - 1) * bit_width); | 114 | 43.0k | } | 115 | | | 116 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end | 117 | 12.2k | s <<= (bytes << 3) - byte; | 118 | | | 119 | | // Starting with the highest valid bit, take out 8 bits in sequence | 120 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid. | 121 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end | 122 | 40.6k | for (int i = 0; i < bytes; i++) { | 123 | 28.4k | output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask; | 124 | 28.4k | } | 125 | 12.2k | } |
Unexecuted instantiation: _ZN5doris10ForEncoderIhE10bit_pack_8EPKhhiPh Unexecuted instantiation: _ZN5doris10ForEncoderItE10bit_pack_8EPKthiPh _ZN5doris10ForEncoderIjE10bit_pack_8EPKjhiPh Line | Count | Source | 79 | 6 | void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 80 | 6 | int64_t s = 0; | 81 | 6 | uint8_t output_mask = 255; | 82 | 6 | int tail_count = in_num & 7; // the remainder of in_num modulo 8 | 83 | 6 | int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8 | 84 | | | 85 | 102 | for (int i = 0; i < full_batch_size; i += 8) { | 86 | | // Put the 8 numbers in the input into s in order, each number occupies bit_width bit | 87 | 96 | s |= static_cast<int64_t>(input[i + 7]); | 88 | 96 | s |= (static_cast<int64_t>(input[i + 6])) << bit_width; | 89 | 96 | s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width); | 90 | 96 | s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width); | 91 | 96 | s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width); | 92 | 96 | s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width); | 93 | 96 | s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width); | 94 | 96 | s |= (static_cast<int64_t>(input[i])) << (7 * bit_width); | 95 | | | 96 | | // Starting with the highest valid bit, take out 8 bits in sequence | 97 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 98 | | // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end | 99 | 192 | for (int j = 0; j < bit_width; j++) { | 100 | 96 | output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask; | 101 | 96 | } | 102 | 96 | output += bit_width; | 103 | 96 | s = 0; | 104 | 96 | } | 105 | | | 106 | | // remainder | 107 | 6 | int byte = tail_count * bit_width; // How many bits are left to store | 108 | 6 | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input | 109 | | | 110 | | // Put 
the tail_count numbers in the input into s in order, each number occupies bit_width bit | 111 | 6 | for (int i = 0; i < tail_count; i++) { | 112 | 0 | s |= (static_cast<int64_t>(input[i + full_batch_size])) | 113 | 0 | << ((tail_count - i - 1) * bit_width); | 114 | 0 | } | 115 | | | 116 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end | 117 | 6 | s <<= (bytes << 3) - byte; | 118 | | | 119 | | // Starting with the highest valid bit, take out 8 bits in sequence | 120 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid. | 121 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end | 122 | 6 | for (int i = 0; i < bytes; i++) { | 123 | 0 | output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask; | 124 | 0 | } | 125 | 6 | } |
Unexecuted instantiation: _ZN5doris10ForEncoderImE10bit_pack_8EPKmhiPh Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10bit_pack_8EPKS1_hiPh Unexecuted instantiation: _ZN5doris10ForEncoderIoE10bit_pack_8EPKohiPh |
126 | | |
127 | | template <typename T> |
128 | | template <typename U> |
129 | 45.8k | void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { |
130 | 45.8k | U s = 0; |
131 | 45.8k | uint8_t output_mask = 255; |
132 | 45.8k | int tail_count = in_num & 3; // the remainder of in_num modulo 4 |
133 | 45.8k | int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4 |
134 | 45.8k | int output_size = 0; // How many outputs can be processed at a time |
135 | 45.8k | int bit_width_remainder = |
136 | 45.8k | (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time |
137 | 45.8k | int extra_bit = 0; // Extra bits after each process |
138 | | |
139 | 1.40M | for (int i = 0; i < full_batch_size; i += 4) { |
140 | | // Put the 4 numbers in the input into s in order, each number occupies bit_width bit |
141 | | // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop |
142 | 1.35M | s <<= bit_width; |
143 | 1.35M | s |= (static_cast<U>(input[i])); |
144 | 1.35M | s <<= bit_width; |
145 | 1.35M | s |= (static_cast<U>(input[i + 1])); |
146 | 1.35M | s <<= bit_width; |
147 | 1.35M | s |= (static_cast<U>(input[i + 2])); |
148 | 1.35M | s <<= bit_width; |
149 | 1.35M | s |= (static_cast<U>(input[i + 3])); |
150 | | |
151 | | // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s, |
152 | | // and there are extra_bit bits left over from the last loop, |
153 | | // divide by 8 to calculate how much output can be processed in this loop. |
154 | 1.35M | output_size = ((bit_width << 2) + extra_bit) >> 3; |
155 | | |
156 | | // Each loop will leave bit_width_remainder bit unprocessed, |
157 | | // last loop will leave extra_bit bit, eventually will leave |
158 | | // (extra_bit + bit_width_remainder) & 7 bit unprocessed |
159 | 1.35M | extra_bit = (extra_bit + bit_width_remainder) & 7; |
160 | | |
161 | | // Starting with the highest valid bit, take out 8 bits in sequence |
162 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid |
163 | | // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end |
164 | | // But since there are still extra_bit bits that can't be processed, need to add the extra_bit |
165 | 15.2M | for (int j = 0; j < output_size; j++) { |
166 | 13.8M | output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask; |
167 | 13.8M | } |
168 | 1.35M | output += output_size; |
169 | | |
170 | | // s retains the post extra_bit bit as it is not processed |
171 | 1.35M | s &= (1 << extra_bit) - 1; |
172 | 1.35M | } |
173 | | |
174 | | // remainder |
175 | 45.8k | int byte = tail_count * bit_width; // How many bits are left to store |
176 | 45.8k | if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed |
177 | 45.8k | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input |
178 | | |
179 | | // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit |
180 | 110k | for (int i = 0; i < tail_count; i++) { |
181 | 64.4k | s <<= bit_width; |
182 | 64.4k | s |= (input[i + full_batch_size]); |
183 | 64.4k | } |
184 | | |
185 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end |
186 | 45.8k | s <<= (bytes << 3) - byte; |
187 | | |
188 | | // Starting with the highest valid bit, take out 8 bits in sequence |
189 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid. |
190 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end |
191 | 231k | for (int i = 0; i < bytes; i++) { |
192 | 185k | output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask; |
193 | 185k | } |
194 | 45.8k | } Unexecuted instantiation: _ZN5doris10ForEncoderIaE10bit_pack_4IlEEvPKahiPh Unexecuted instantiation: _ZN5doris10ForEncoderIaE10bit_pack_4InEEvPKahiPh Unexecuted instantiation: _ZN5doris10ForEncoderIsE10bit_pack_4IlEEvPKshiPh Unexecuted instantiation: _ZN5doris10ForEncoderIsE10bit_pack_4InEEvPKshiPh Unexecuted instantiation: _ZN5doris10ForEncoderIiE10bit_pack_4IlEEvPKihiPh Unexecuted instantiation: _ZN5doris10ForEncoderIiE10bit_pack_4InEEvPKihiPh _ZN5doris10ForEncoderIlE10bit_pack_4IlEEvPKlhiPh Line | Count | Source | 129 | 3.03k | void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 130 | 3.03k | U s = 0; | 131 | 3.03k | uint8_t output_mask = 255; | 132 | 3.03k | int tail_count = in_num & 3; // the remainder of in_num modulo 4 | 133 | 3.03k | int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4 | 134 | 3.03k | int output_size = 0; // How many outputs can be processed at a time | 135 | 3.03k | int bit_width_remainder = | 136 | 3.03k | (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time | 137 | 3.03k | int extra_bit = 0; // Extra bits after each process | 138 | | | 139 | 67.5k | for (int i = 0; i < full_batch_size; i += 4) { | 140 | | // Put the 4 numbers in the input into s in order, each number occupies bit_width bit | 141 | | // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop | 142 | 64.5k | s <<= bit_width; | 143 | 64.5k | s |= (static_cast<U>(input[i])); | 144 | 64.5k | s <<= bit_width; | 145 | 64.5k | s |= (static_cast<U>(input[i + 1])); | 146 | 64.5k | s <<= bit_width; | 147 | 64.5k | s |= (static_cast<U>(input[i + 2])); | 148 | 64.5k | s <<= bit_width; | 149 | 64.5k | s |= (static_cast<U>(input[i + 3])); | 150 | | | 151 | | // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s, | 152 | | // and there are extra_bit bits left over from the last loop, | 153 | | 
// divide by 8 to calculate how much output can be processed in this loop. | 154 | 64.5k | output_size = ((bit_width << 2) + extra_bit) >> 3; | 155 | | | 156 | | // Each loop will leave bit_width_remainder bit unprocessed, | 157 | | // last loop will leave extra_bit bit, eventually will leave | 158 | | // (extra_bit + bit_width_remainder) & 7 bit unprocessed | 159 | 64.5k | extra_bit = (extra_bit + bit_width_remainder) & 7; | 160 | | | 161 | | // Starting with the highest valid bit, take out 8 bits in sequence | 162 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 163 | | // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end | 164 | | // But since there are still extra_bit bits that can't be processed, need to add the extra_bit | 165 | 467k | for (int j = 0; j < output_size; j++) { | 166 | 402k | output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask; | 167 | 402k | } | 168 | 64.5k | output += output_size; | 169 | | | 170 | | // s retains the post extra_bit bit as it is not processed | 171 | 64.5k | s &= (1 << extra_bit) - 1; | 172 | 64.5k | } | 173 | | | 174 | | // remainder | 175 | 3.03k | int byte = tail_count * bit_width; // How many bits are left to store | 176 | 3.03k | if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed | 177 | 3.03k | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input | 178 | | | 179 | | // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit | 180 | 6.08k | for (int i = 0; i < tail_count; i++) { | 181 | 3.04k | s <<= bit_width; | 182 | 3.04k | s |= (input[i + full_batch_size]); | 183 | 3.04k | } | 184 | | | 185 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end | 186 | 3.03k | s <<= (bytes << 3) - byte; | 187 | | | 188 | | // Starting with the highest valid bit, take out 8 bits in sequence | 189 | | // 
perform an AND operation with output_mask to ensure that only 8 bits are valid. | 190 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end | 191 | 8.75k | for (int i = 0; i < bytes; i++) { | 192 | 5.71k | output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask; | 193 | 5.71k | } | 194 | 3.03k | } |
_ZN5doris10ForEncoderIlE10bit_pack_4InEEvPKlhiPh Line | Count | Source | 129 | 6.08k | void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 130 | 6.08k | U s = 0; | 131 | 6.08k | uint8_t output_mask = 255; | 132 | 6.08k | int tail_count = in_num & 3; // the remainder of in_num modulo 4 | 133 | 6.08k | int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4 | 134 | 6.08k | int output_size = 0; // How many outputs can be processed at a time | 135 | 6.08k | int bit_width_remainder = | 136 | 6.08k | (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time | 137 | 6.08k | int extra_bit = 0; // Extra bits after each process | 138 | | | 139 | 135k | for (int i = 0; i < full_batch_size; i += 4) { | 140 | | // Put the 4 numbers in the input into s in order, each number occupies bit_width bit | 141 | | // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop | 142 | 129k | s <<= bit_width; | 143 | 129k | s |= (static_cast<U>(input[i])); | 144 | 129k | s <<= bit_width; | 145 | 129k | s |= (static_cast<U>(input[i + 1])); | 146 | 129k | s <<= bit_width; | 147 | 129k | s |= (static_cast<U>(input[i + 2])); | 148 | 129k | s <<= bit_width; | 149 | 129k | s |= (static_cast<U>(input[i + 3])); | 150 | | | 151 | | // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s, | 152 | | // and there are extra_bit bits left over from the last loop, | 153 | | // divide by 8 to calculate how much output can be processed in this loop. 
| 154 | 129k | output_size = ((bit_width << 2) + extra_bit) >> 3; | 155 | | | 156 | | // Each loop will leave bit_width_remainder bit unprocessed, | 157 | | // last loop will leave extra_bit bit, eventually will leave | 158 | | // (extra_bit + bit_width_remainder) & 7 bit unprocessed | 159 | 129k | extra_bit = (extra_bit + bit_width_remainder) & 7; | 160 | | | 161 | | // Starting with the highest valid bit, take out 8 bits in sequence | 162 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 163 | | // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end | 164 | | // But since there are still extra_bit bits that can't be processed, need to add the extra_bit | 165 | 1.70M | for (int j = 0; j < output_size; j++) { | 166 | 1.58M | output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask; | 167 | 1.58M | } | 168 | 129k | output += output_size; | 169 | | | 170 | | // s retains the post extra_bit bit as it is not processed | 171 | 129k | s &= (1 << extra_bit) - 1; | 172 | 129k | } | 173 | | | 174 | | // remainder | 175 | 6.08k | int byte = tail_count * bit_width; // How many bits are left to store | 176 | 6.08k | if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed | 177 | 6.08k | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input | 178 | | | 179 | | // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit | 180 | 12.2k | for (int i = 0; i < tail_count; i++) { | 181 | 6.12k | s <<= bit_width; | 182 | 6.12k | s |= (input[i + full_batch_size]); | 183 | 6.12k | } | 184 | | | 185 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end | 186 | 6.08k | s <<= (bytes << 3) - byte; | 187 | | | 188 | | // Starting with the highest valid bit, take out 8 bits in sequence | 189 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid. 
| 190 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end | 191 | 26.7k | for (int i = 0; i < bytes; i++) { | 192 | 20.6k | output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask; | 193 | 20.6k | } | 194 | 6.08k | } |
_ZN5doris10ForEncoderInE10bit_pack_4IlEEvPKnhiPh Line | Count | Source | 129 | 12.2k | void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 130 | 12.2k | U s = 0; | 131 | 12.2k | uint8_t output_mask = 255; | 132 | 12.2k | int tail_count = in_num & 3; // the remainder of in_num modulo 4 | 133 | 12.2k | int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4 | 134 | 12.2k | int output_size = 0; // How many outputs can be processed at a time | 135 | 12.2k | int bit_width_remainder = | 136 | 12.2k | (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time | 137 | 12.2k | int extra_bit = 0; // Extra bits after each process | 138 | | | 139 | 399k | for (int i = 0; i < full_batch_size; i += 4) { | 140 | | // Put the 4 numbers in the input into s in order, each number occupies bit_width bit | 141 | | // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop | 142 | 387k | s <<= bit_width; | 143 | 387k | s |= (static_cast<U>(input[i])); | 144 | 387k | s <<= bit_width; | 145 | 387k | s |= (static_cast<U>(input[i + 1])); | 146 | 387k | s <<= bit_width; | 147 | 387k | s |= (static_cast<U>(input[i + 2])); | 148 | 387k | s <<= bit_width; | 149 | 387k | s |= (static_cast<U>(input[i + 3])); | 150 | | | 151 | | // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s, | 152 | | // and there are extra_bit bits left over from the last loop, | 153 | | // divide by 8 to calculate how much output can be processed in this loop. 
| 154 | 387k | output_size = ((bit_width << 2) + extra_bit) >> 3; | 155 | | | 156 | | // Each loop will leave bit_width_remainder bit unprocessed, | 157 | | // last loop will leave extra_bit bit, eventually will leave | 158 | | // (extra_bit + bit_width_remainder) & 7 bit unprocessed | 159 | 387k | extra_bit = (extra_bit + bit_width_remainder) & 7; | 160 | | | 161 | | // Starting with the highest valid bit, take out 8 bits in sequence | 162 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 163 | | // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end | 164 | | // But since there are still extra_bit bits that can't be processed, need to add the extra_bit | 165 | 2.80M | for (int j = 0; j < output_size; j++) { | 166 | 2.41M | output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask; | 167 | 2.41M | } | 168 | 387k | output += output_size; | 169 | | | 170 | | // s retains the post extra_bit bit as it is not processed | 171 | 387k | s &= (1 << extra_bit) - 1; | 172 | 387k | } | 173 | | | 174 | | // remainder | 175 | 12.2k | int byte = tail_count * bit_width; // How many bits are left to store | 176 | 12.2k | if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed | 177 | 12.2k | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input | 178 | | | 179 | | // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit | 180 | 30.6k | for (int i = 0; i < tail_count; i++) { | 181 | 18.4k | s <<= bit_width; | 182 | 18.4k | s |= (input[i + full_batch_size]); | 183 | 18.4k | } | 184 | | | 185 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end | 186 | 12.2k | s <<= (bytes << 3) - byte; | 187 | | | 188 | | // Starting with the highest valid bit, take out 8 bits in sequence | 189 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid. 
| 190 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end | 191 | 46.8k | for (int i = 0; i < bytes; i++) { | 192 | 34.5k | output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask; | 193 | 34.5k | } | 194 | 12.2k | } |
_ZN5doris10ForEncoderInE10bit_pack_4InEEvPKnhiPh Line | Count | Source | 129 | 24.4k | void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 130 | 24.4k | U s = 0; | 131 | 24.4k | uint8_t output_mask = 255; | 132 | 24.4k | int tail_count = in_num & 3; // the remainder of in_num modulo 4 | 133 | 24.4k | int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4 | 134 | 24.4k | int output_size = 0; // How many outputs can be processed at a time | 135 | 24.4k | int bit_width_remainder = | 136 | 24.4k | (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time | 137 | 24.4k | int extra_bit = 0; // Extra bits after each process | 138 | | | 139 | 798k | for (int i = 0; i < full_batch_size; i += 4) { | 140 | | // Put the 4 numbers in the input into s in order, each number occupies bit_width bit | 141 | | // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop | 142 | 774k | s <<= bit_width; | 143 | 774k | s |= (static_cast<U>(input[i])); | 144 | 774k | s <<= bit_width; | 145 | 774k | s |= (static_cast<U>(input[i + 1])); | 146 | 774k | s <<= bit_width; | 147 | 774k | s |= (static_cast<U>(input[i + 2])); | 148 | 774k | s <<= bit_width; | 149 | 774k | s |= (static_cast<U>(input[i + 3])); | 150 | | | 151 | | // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s, | 152 | | // and there are extra_bit bits left over from the last loop, | 153 | | // divide by 8 to calculate how much output can be processed in this loop. 
| 154 | 774k | output_size = ((bit_width << 2) + extra_bit) >> 3; | 155 | | | 156 | | // Each loop will leave bit_width_remainder bit unprocessed, | 157 | | // last loop will leave extra_bit bit, eventually will leave | 158 | | // (extra_bit + bit_width_remainder) & 7 bit unprocessed | 159 | 774k | extra_bit = (extra_bit + bit_width_remainder) & 7; | 160 | | | 161 | | // Starting with the highest valid bit, take out 8 bits in sequence | 162 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 163 | | // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end | 164 | | // But since there are still extra_bit bits that can't be processed, need to add the extra_bit | 165 | 10.2M | for (int j = 0; j < output_size; j++) { | 166 | 9.48M | output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask; | 167 | 9.48M | } | 168 | 774k | output += output_size; | 169 | | | 170 | | // s retains the post extra_bit bit as it is not processed | 171 | 774k | s &= (1 << extra_bit) - 1; | 172 | 774k | } | 173 | | | 174 | | // remainder | 175 | 24.4k | int byte = tail_count * bit_width; // How many bits are left to store | 176 | 24.4k | if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed | 177 | 24.4k | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input | 178 | | | 179 | | // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit | 180 | 61.3k | for (int i = 0; i < tail_count; i++) { | 181 | 36.8k | s <<= bit_width; | 182 | 36.8k | s |= (input[i + full_batch_size]); | 183 | 36.8k | } | 184 | | | 185 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end | 186 | 24.4k | s <<= (bytes << 3) - byte; | 187 | | | 188 | | // Starting with the highest valid bit, take out 8 bits in sequence | 189 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid. 
| 190 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end | 191 | 148k | for (int i = 0; i < bytes; i++) { | 192 | 124k | output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask; | 193 | 124k | } | 194 | 24.4k | } |
Unexecuted instantiation: _ZN5doris10ForEncoderIhE10bit_pack_4IlEEvPKhhiPh Unexecuted instantiation: _ZN5doris10ForEncoderIhE10bit_pack_4InEEvPKhhiPh Unexecuted instantiation: _ZN5doris10ForEncoderItE10bit_pack_4IlEEvPKthiPh Unexecuted instantiation: _ZN5doris10ForEncoderItE10bit_pack_4InEEvPKthiPh Unexecuted instantiation: _ZN5doris10ForEncoderIjE10bit_pack_4IlEEvPKjhiPh Unexecuted instantiation: _ZN5doris10ForEncoderIjE10bit_pack_4InEEvPKjhiPh Unexecuted instantiation: _ZN5doris10ForEncoderImE10bit_pack_4IlEEvPKmhiPh Unexecuted instantiation: _ZN5doris10ForEncoderImE10bit_pack_4InEEvPKmhiPh Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10bit_pack_4IlEEvPKS1_hiPh Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10bit_pack_4InEEvPKS1_hiPh Unexecuted instantiation: _ZN5doris10ForEncoderIoE10bit_pack_4IlEEvPKohiPh Unexecuted instantiation: _ZN5doris10ForEncoderIoE10bit_pack_4InEEvPKohiPh |
195 | | |
196 | | template <typename T> /* bit_pack_1: writes in_num values from input to output one value at a time, bit_width bits per value, most significant bits first. */ |
197 | 181k | void ForEncoder<T>::bit_pack_1(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { |
198 | 181k | int output_mask = 255; |
199 | 181k | int need_bit = 0; // free low bits left in the output byte that the previous value only partly filled |
200 | | |
201 | 21.9M | for (int i = 0; i < in_num; i++) { |
202 | 21.7M | T x = input[i]; |
203 | 21.7M | int width = bit_width; |
204 | 21.7M | if (need_bit) { |
205 | | // The previous value filled only the high (8 - need_bit) bits of *output, |
206 | | // so that byte is completed with the top need_bit bits of this value. |
207 | | // Shifting right by (width - need_bit) drops every bit of x below those top bits. |
208 | 15.0M | *output |= x >> (width - need_bit); |
209 | 15.0M | output++; |
210 | | // need_bit bits of this value are now stored, so only the rest remains to be written. |
211 | 15.0M | width -= need_bit; |
212 | 15.0M | } |
213 | 21.7M | int num = width >> 3; // number of whole output bytes the remaining bits fill |
214 | 21.7M | int remainder = width & 7; // bits left over after those whole bytes |
215 | | |
216 | | // Emit the whole bytes, starting with the most significant one. |
217 | | // The AND with output_mask keeps only the low 8 bits of the shifted value. |
218 | | // (num - j - 1) << 3 is the number of bits that belong to the whole bytes after byte j. |
219 | | // The remainder low bits are not written in this loop, so they are shifted out as well. |
220 | 223M | for (int j = 0; j < num; j++) { |
221 | 202M | *output = cast_set<uint8_t>((x >> (((num - j - 1) << 3) + remainder)) & output_mask); |
222 | 202M | output++; |
223 | 202M | } |
224 | 21.7M | if (remainder) { |
225 | | // Store the last remainder bits of this value. |
226 | | // y = x & ((1 << remainder) - 1) extracts the low remainder bits. |
227 | | // *output = y << (8 - remainder) places them in the high bits of the next byte. |
228 | 15.1M | *output = cast_set<uint8_t>((x & ((1 << remainder) - 1)) << (8 - remainder)); |
229 | | // This byte now holds remainder bits; the next value must supply the other 8 - remainder bits. |
230 | 15.1M | need_bit = 8 - remainder; |
231 | 15.1M | } else { |
232 | 6.57M | need_bit = 0; |
233 | 6.57M | } |
234 | 21.7M | } |
235 | 181k | } Unexecuted instantiation: _ZN5doris10ForEncoderIaE10bit_pack_1EPKahiPh Unexecuted instantiation: _ZN5doris10ForEncoderIsE10bit_pack_1EPKshiPh Unexecuted instantiation: _ZN5doris10ForEncoderIiE10bit_pack_1EPKihiPh _ZN5doris10ForEncoderIlE10bit_pack_1EPKlhiPh Line | Count | Source | 197 | 12.1k | void ForEncoder<T>::bit_pack_1(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 198 | 12.1k | int output_mask = 255; | 199 | 12.1k | int need_bit = 0; // still need | 200 | | | 201 | 1.05M | for (int i = 0; i < in_num; i++) { | 202 | 1.04M | T x = input[i]; | 203 | 1.04M | int width = bit_width; | 204 | 1.04M | if (need_bit) { | 205 | | // The last time we take away the high 8 - need_bit, | 206 | | // we need to make up the rest of the need_bit from the width. | 207 | | // Use width - need_bit to compute high need_bit bits | 208 | 743k | *output |= x >> (width - need_bit); | 209 | 743k | output++; | 210 | | // There are need_bit bits being used, so subtract | 211 | 743k | width -= need_bit; | 212 | 743k | } | 213 | 1.04M | int num = width >> 3; // How many outputs can be processed at a time | 214 | 1.04M | int remainder = width & 7; // How many bits are left to store | 215 | | | 216 | | // Starting with the highest valid bit, take out 8 bits in sequence | 217 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 218 | | // (num-j-1)<<3 used to calculate how many bits need to be removed at the end | 219 | | // But since there are still remainder bits that can't be processed, need to add the remainder | 220 | 6.62M | for (int j = 0; j < num; j++) { | 221 | 5.58M | *output = cast_set<uint8_t>((x >> (((num - j - 1) << 3) + remainder)) & output_mask); | 222 | 5.58M | output++; | 223 | 5.58M | } | 224 | 1.04M | if (remainder) { | 225 | | // Process the last remaining remainder bit. | 226 | | // y = (x & ((1 << remainder) - 1)) extract the last remainder bits. 
| 227 | | // ouput = y << (8 - reaminder) Use the high 8 - remainder bit | 228 | 749k | *output = cast_set<uint8_t>((x & ((1 << remainder) - 1)) << (8 - remainder)); | 229 | | // Already have remainder bits, next time need 8-remainder bits | 230 | 749k | need_bit = 8 - remainder; | 231 | 749k | } else { | 232 | 294k | need_bit = 0; | 233 | 294k | } | 234 | 1.04M | } | 235 | 12.1k | } |
_ZN5doris10ForEncoderInE10bit_pack_1EPKnhiPh Line | Count | Source | 197 | 169k | void ForEncoder<T>::bit_pack_1(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 198 | 169k | int output_mask = 255; | 199 | 169k | int need_bit = 0; // still need | 200 | | | 201 | 20.8M | for (int i = 0; i < in_num; i++) { | 202 | 20.6M | T x = input[i]; | 203 | 20.6M | int width = bit_width; | 204 | 20.6M | if (need_bit) { | 205 | | // The last time we take away the high 8 - need_bit, | 206 | | // we need to make up the rest of the need_bit from the width. | 207 | | // Use width - need_bit to compute high need_bit bits | 208 | 14.3M | *output |= x >> (width - need_bit); | 209 | 14.3M | output++; | 210 | | // There are need_bit bits being used, so subtract | 211 | 14.3M | width -= need_bit; | 212 | 14.3M | } | 213 | 20.6M | int num = width >> 3; // How many outputs can be processed at a time | 214 | 20.6M | int remainder = width & 7; // How many bits are left to store | 215 | | | 216 | | // Starting with the highest valid bit, take out 8 bits in sequence | 217 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 218 | | // (num-j-1)<<3 used to calculate how many bits need to be removed at the end | 219 | | // But since there are still remainder bits that can't be processed, need to add the remainder | 220 | 217M | for (int j = 0; j < num; j++) { | 221 | 196M | *output = cast_set<uint8_t>((x >> (((num - j - 1) << 3) + remainder)) & output_mask); | 222 | 196M | output++; | 223 | 196M | } | 224 | 20.6M | if (remainder) { | 225 | | // Process the last remaining remainder bit. | 226 | | // y = (x & ((1 << remainder) - 1)) extract the last remainder bits. 
| 227 | | // ouput = y << (8 - reaminder) Use the high 8 - remainder bit | 228 | 14.4M | *output = cast_set<uint8_t>((x & ((1 << remainder) - 1)) << (8 - remainder)); | 229 | | // Already have remainder bits, next time need 8-remainder bits | 230 | 14.4M | need_bit = 8 - remainder; | 231 | 14.4M | } else { | 232 | 6.27M | need_bit = 0; | 233 | 6.27M | } | 234 | 20.6M | } | 235 | 169k | } |
Unexecuted instantiation: _ZN5doris10ForEncoderIhE10bit_pack_1EPKhhiPh Unexecuted instantiation: _ZN5doris10ForEncoderItE10bit_pack_1EPKthiPh Unexecuted instantiation: _ZN5doris10ForEncoderIjE10bit_pack_1EPKjhiPh Unexecuted instantiation: _ZN5doris10ForEncoderImE10bit_pack_1EPKmhiPh Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10bit_pack_1EPKS1_hiPh Unexecuted instantiation: _ZN5doris10ForEncoderIoE10bit_pack_1EPKohiPh |
236 | | |
237 | | // Stores a list of integers using only bit_width bits for each value. |
238 | | // param[in] input: the integers to pack |
239 | | // param[in] in_num: the number of integers to pack |
240 | | // param[in] bit_width: how many bits are used to store each integer |
241 | | // param[out] output: the packed result; nothing is written when in_num or bit_width is 0 |
242 | | |
243 | | // For example: |
244 | | // The input is the int32 list 1, 2, 4, 8 and bit_width is 4 |
245 | | // The output will be: 0001 0010 0100 1000 |
246 | | template <typename T> |
247 | 243k | void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { |
248 | 243k | if (in_num == 0 || bit_width == 0) { |
249 | 260 | return; |
250 | 260 | } |
251 | | /* Packing routine chosen for each bit_width range ('>' presumably ranks measured speed - TODO confirm): |
252 | | bit_width <= 8 : pack_8 > pack_16 > pack_32 |
253 | | bit_width <= 16 : pack_4 > pack_8 > pack_16 |
254 | | bit_width <= 32 : pack_4 >= pack_2 > pack_8 |
255 | | (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently) |
256 | | bit_width <= 64 : pack_1 > pack_4 |
257 | | */ |
258 | 242k | if (bit_width <= 8) { |
259 | 15.3k | bit_pack_8(input, in_num, bit_width, output); |
260 | 227k | } else if (bit_width <= 16) { |
261 | 15.2k | bit_pack_4<int64_t>(input, in_num, bit_width, output); |
262 | 212k | } else if (bit_width <= 32) { |
263 | 30.5k | bit_pack_4<__int128_t>(input, in_num, bit_width, output); |
264 | 181k | } else { |
265 | 181k | bit_pack_1(input, in_num, bit_width, output); |
266 | 181k | } |
267 | 242k | } Unexecuted instantiation: _ZN5doris10ForEncoderIaE8bit_packEPKahiPh Unexecuted instantiation: _ZN5doris10ForEncoderIsE8bit_packEPKshiPh _ZN5doris10ForEncoderIiE8bit_packEPKihiPh Line | Count | Source | 247 | 9 | void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 248 | 9 | if (in_num == 0 || bit_width == 0) { | 249 | 1 | return; | 250 | 1 | } | 251 | | /* | 252 | | bit_width <= 8 : pack_8 > pack_16 > pack_32 | 253 | | bit_width <= 16 : pack_4 > pack_8 > pack_16 | 254 | | bit_width <= 32 : pack_4 >= pack_2 > pack_8 | 255 | | (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently) | 256 | | bit_width <= 64 : pack_1 > pack_4 | 257 | | */ | 258 | 8 | if (bit_width <= 8) { | 259 | 8 | bit_pack_8(input, in_num, bit_width, output); | 260 | 8 | } else if (bit_width <= 16) { | 261 | 0 | bit_pack_4<int64_t>(input, in_num, bit_width, output); | 262 | 0 | } else if (bit_width <= 32) { | 263 | 0 | bit_pack_4<__int128_t>(input, in_num, bit_width, output); | 264 | 0 | } else { | 265 | 0 | bit_pack_1(input, in_num, bit_width, output); | 266 | 0 | } | 267 | 8 | } |
_ZN5doris10ForEncoderIlE8bit_packEPKlhiPh Line | Count | Source | 247 | 24.4k | void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 248 | 24.4k | if (in_num == 0 || bit_width == 0) { | 249 | 131 | return; | 250 | 131 | } | 251 | | /* | 252 | | bit_width <= 8 : pack_8 > pack_16 > pack_32 | 253 | | bit_width <= 16 : pack_4 > pack_8 > pack_16 | 254 | | bit_width <= 32 : pack_4 >= pack_2 > pack_8 | 255 | | (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently) | 256 | | bit_width <= 64 : pack_1 > pack_4 | 257 | | */ | 258 | 24.3k | if (bit_width <= 8) { | 259 | 3.05k | bit_pack_8(input, in_num, bit_width, output); | 260 | 21.2k | } else if (bit_width <= 16) { | 261 | 3.03k | bit_pack_4<int64_t>(input, in_num, bit_width, output); | 262 | 18.2k | } else if (bit_width <= 32) { | 263 | 6.08k | bit_pack_4<__int128_t>(input, in_num, bit_width, output); | 264 | 12.1k | } else { | 265 | 12.1k | bit_pack_1(input, in_num, bit_width, output); | 266 | 12.1k | } | 267 | 24.3k | } |
_ZN5doris10ForEncoderInE8bit_packEPKnhiPh Line | Count | Source | 247 | 218k | void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 248 | 218k | if (in_num == 0 || bit_width == 0) { | 249 | 128 | return; | 250 | 128 | } | 251 | | /* | 252 | | bit_width <= 8 : pack_8 > pack_16 > pack_32 | 253 | | bit_width <= 16 : pack_4 > pack_8 > pack_16 | 254 | | bit_width <= 32 : pack_4 >= pack_2 > pack_8 | 255 | | (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently) | 256 | | bit_width <= 64 : pack_1 > pack_4 | 257 | | */ | 258 | 218k | if (bit_width <= 8) { | 259 | 12.2k | bit_pack_8(input, in_num, bit_width, output); | 260 | 206k | } else if (bit_width <= 16) { | 261 | 12.2k | bit_pack_4<int64_t>(input, in_num, bit_width, output); | 262 | 194k | } else if (bit_width <= 32) { | 263 | 24.4k | bit_pack_4<__int128_t>(input, in_num, bit_width, output); | 264 | 169k | } else { | 265 | 169k | bit_pack_1(input, in_num, bit_width, output); | 266 | 169k | } | 267 | 218k | } |
Unexecuted instantiation: _ZN5doris10ForEncoderIhE8bit_packEPKhhiPh Unexecuted instantiation: _ZN5doris10ForEncoderItE8bit_packEPKthiPh _ZN5doris10ForEncoderIjE8bit_packEPKjhiPh Line | Count | Source | 247 | 6 | void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 248 | 6 | if (in_num == 0 || bit_width == 0) { | 249 | 0 | return; | 250 | 0 | } | 251 | | /* | 252 | | bit_width <= 8 : pack_8 > pack_16 > pack_32 | 253 | | bit_width <= 16 : pack_4 > pack_8 > pack_16 | 254 | | bit_width <= 32 : pack_4 >= pack_2 > pack_8 | 255 | | (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently) | 256 | | bit_width <= 64 : pack_1 > pack_4 | 257 | | */ | 258 | 6 | if (bit_width <= 8) { | 259 | 6 | bit_pack_8(input, in_num, bit_width, output); | 260 | 6 | } else if (bit_width <= 16) { | 261 | 0 | bit_pack_4<int64_t>(input, in_num, bit_width, output); | 262 | 0 | } else if (bit_width <= 32) { | 263 | 0 | bit_pack_4<__int128_t>(input, in_num, bit_width, output); | 264 | 0 | } else { | 265 | 0 | bit_pack_1(input, in_num, bit_width, output); | 266 | 0 | } | 267 | 6 | } |
Unexecuted instantiation: _ZN5doris10ForEncoderImE8bit_packEPKmhiPh Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE8bit_packEPKS1_hiPh Unexecuted instantiation: _ZN5doris10ForEncoderIoE8bit_packEPKohiPh |
268 | | |
269 | | template <typename T> /* bit_packing_one_frame_value: encodes the _buffered_values_num values at input as one frame. Appends the frame's min and the bit-packed payload to _buffer, records the frame's storage format and bit width, then resets _buffered_values_num to 0. */ |
270 | 48.9k | void ForEncoder<T>::bit_packing_one_frame_value(const T* input) { |
271 | 48.9k | T min = input[0]; |
272 | 48.9k | T max = input[0]; |
273 | 48.9k | bool is_ascending = true; |
274 | 48.9k | uint8_t bit_width = 0; |
275 | 48.9k | T half_max_delta = numeric_limits_max() >> 1; |
276 | 48.9k | bool is_keep_original_value = false; |
277 | | |
278 | | // 1. Single pass: detect whether the frame is ascending, whether the original values must be kept, and find max & min. |
279 | 4.18M | for (uint8_t i = 1; i < _buffered_values_num; ++i) { |
280 | 4.13M | if (is_ascending) { |
281 | 86.4k | if (input[i] < input[i - 1]) { |
282 | 48.4k | is_ascending = false; |
283 | 48.4k | } else { |
284 | 38.0k | if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow: the gap between the halved neighbours exceeds half of numeric_limits_max(), so the delta is not used |
285 | 0 | is_keep_original_value = true; |
286 | 38.0k | } else { |
287 | 38.0k | bit_width = std::max(bit_width, bits(input[i] - input[i - 1])); /* largest bits() result over the adjacent deltas seen so far */ |
288 | 38.0k | } |
289 | 38.0k | } |
290 | 86.4k | } |
291 | | |
292 | 4.13M | if (input[i] < min) { |
293 | 180k | min = input[i]; |
294 | 180k | continue; |
295 | 180k | } |
296 | | |
297 | 3.95M | if (input[i] > max) { |
298 | 183k | max = input[i]; |
299 | 183k | } |
300 | 3.95M | } |
301 | 48.9k | if (!is_ascending) { /* non-ascending frames store input[i] - min, so the max - min range gets the same overflow check */ |
302 | 48.4k | if ((max >> 1) - (min >> 1) > half_max_delta) { |
303 | 0 | is_keep_original_value = true; |
304 | 0 | } |
305 | 48.4k | } |
306 | | |
307 | | // 2. Append min to _buffer with put_fixed128_le / put_fixed64_le / put_fixed32_le, chosen by sizeof(T). |
308 | 48.9k | if (sizeof(T) == 16) { |
309 | 24.4k | put_fixed128_le(_buffer, static_cast<uint128_t>(min)); |
310 | 24.4k | } else if (sizeof(T) == 8) { |
311 | 24.4k | put_fixed64_le(_buffer, static_cast<uint64_t>(min)); |
312 | 24.4k | } else { |
313 | 15 | put_fixed32_le(_buffer, static_cast<uint32_t>(min)); |
314 | 15 | } |
315 | | |
316 | | // 3.1 Keep the original values: pack input itself using the full width of T. |
317 | 48.9k | if (is_keep_original_value) { |
318 | 0 | bit_width = sizeof(T) * 8; |
319 | 0 | uint32_t len = _buffered_values_num * bit_width; /* NOTE(review): len is values * bits but is used as a byte count for reserve/resize below, while step 3.2 divides by 8 via BitUtil::Ceil - confirm this size is intended */ |
320 | 0 | _buffer->reserve(_buffer->size() + len); |
321 | 0 | size_t origin_size = _buffer->size(); |
322 | 0 | _buffer->resize(origin_size + len); |
323 | 0 | bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size); |
324 | 48.9k | } else { |
325 | | // 3.2 Bit-pack deltas instead of the original values. |
326 | | // Ascending input stores the difference to the previous value, which can need fewer bits; other input stores the offset from min. |
327 | 48.9k | T delta_values[FRAME_VALUE_NUM]; |
328 | 48.9k | if (is_ascending) { |
329 | 449 | delta_values[0] = 0; /* for an ascending frame min is still input[0], and min is already stored in step 2 */ |
330 | 3.17k | for (uint8_t i = 1; i < _buffered_values_num; ++i) { |
331 | 2.72k | delta_values[i] = input[i] - input[i - 1]; |
332 | 2.72k | } |
333 | 48.4k | } else { |
334 | 48.4k | bit_width = bits(static_cast<T>(max - min)); |
335 | 4.22M | for (uint8_t i = 0; i < _buffered_values_num; ++i) { |
336 | 4.17M | delta_values[i] = input[i] - min; |
337 | 4.17M | } |
338 | 48.4k | } |
339 | | |
340 | 48.9k | uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8); /* payload size in bytes - presumably the bit count divided by 8 and rounded up */ |
341 | | |
342 | 48.9k | _buffer->reserve(_buffer->size() + packing_len); |
343 | 48.9k | size_t origin_size = _buffer->size(); |
344 | 48.9k | _buffer->resize(origin_size + packing_len); |
345 | 48.9k | bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size); |
346 | 48.9k | } |
347 | 48.9k | uint8_t storage_format = 0; /* 0: offsets from min, 1: ascending deltas, 2: original values */ |
348 | 48.9k | if (is_keep_original_value) { |
349 | 0 | storage_format = 2; |
350 | 48.9k | } else if (is_ascending) { |
351 | 449 | storage_format = 1; |
352 | 449 | } |
353 | 48.9k | _storage_formats.push_back(storage_format); |
354 | 48.9k | _bit_widths.push_back(bit_width); |
355 | | |
356 | 48.9k | _buffered_values_num = 0; |
357 | 48.9k | } Unexecuted instantiation: _ZN5doris10ForEncoderIaE27bit_packing_one_frame_valueEPKa Unexecuted instantiation: _ZN5doris10ForEncoderIsE27bit_packing_one_frame_valueEPKs _ZN5doris10ForEncoderIiE27bit_packing_one_frame_valueEPKi Line | Count | Source | 270 | 9 | void ForEncoder<T>::bit_packing_one_frame_value(const T* input) { | 271 | 9 | T min = input[0]; | 272 | 9 | T max = input[0]; | 273 | 9 | bool is_ascending = true; | 274 | 9 | uint8_t bit_width = 0; | 275 | 9 | T half_max_delta = numeric_limits_max() >> 1; | 276 | 9 | bool is_keep_original_value = false; | 277 | | | 278 | | // 1. make sure order_flag, save_original_value, and find max&min. | 279 | 771 | for (uint8_t i = 1; i < _buffered_values_num; ++i) { | 280 | 762 | if (is_ascending) { | 281 | 762 | if (input[i] < input[i - 1]) { | 282 | 0 | is_ascending = false; | 283 | 762 | } else { | 284 | 762 | if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow | 285 | 0 | is_keep_original_value = true; | 286 | 762 | } else { | 287 | 762 | bit_width = std::max(bit_width, bits(input[i] - input[i - 1])); | 288 | 762 | } | 289 | 762 | } | 290 | 762 | } | 291 | | | 292 | 762 | if (input[i] < min) { | 293 | 0 | min = input[i]; | 294 | 0 | continue; | 295 | 0 | } | 296 | | | 297 | 762 | if (input[i] > max) { | 298 | 762 | max = input[i]; | 299 | 762 | } | 300 | 762 | } | 301 | 9 | if (!is_ascending) { | 302 | 0 | if ((max >> 1) - (min >> 1) > half_max_delta) { | 303 | 0 | is_keep_original_value = true; | 304 | 0 | } | 305 | 0 | } | 306 | | | 307 | | // 2. save min value. | 308 | 9 | if (sizeof(T) == 16) { | 309 | 0 | put_fixed128_le(_buffer, static_cast<uint128_t>(min)); | 310 | 9 | } else if (sizeof(T) == 8) { | 311 | 0 | put_fixed64_le(_buffer, static_cast<uint64_t>(min)); | 312 | 9 | } else { | 313 | 9 | put_fixed32_le(_buffer, static_cast<uint32_t>(min)); | 314 | 9 | } | 315 | | | 316 | | // 3.1 save original value. 
| 317 | 9 | if (is_keep_original_value) { | 318 | 0 | bit_width = sizeof(T) * 8; | 319 | 0 | uint32_t len = _buffered_values_num * bit_width; | 320 | 0 | _buffer->reserve(_buffer->size() + len); | 321 | 0 | size_t origin_size = _buffer->size(); | 322 | 0 | _buffer->resize(origin_size + len); | 323 | 0 | bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size); | 324 | 9 | } else { | 325 | | // 3.2 bit pack. | 326 | | // improve for ascending order input, we could use fewer bit | 327 | 9 | T delta_values[FRAME_VALUE_NUM]; | 328 | 9 | if (is_ascending) { | 329 | 9 | delta_values[0] = 0; | 330 | 771 | for (uint8_t i = 1; i < _buffered_values_num; ++i) { | 331 | 762 | delta_values[i] = input[i] - input[i - 1]; | 332 | 762 | } | 333 | 9 | } else { | 334 | 0 | bit_width = bits(static_cast<T>(max - min)); | 335 | 0 | for (uint8_t i = 0; i < _buffered_values_num; ++i) { | 336 | 0 | delta_values[i] = input[i] - min; | 337 | 0 | } | 338 | 0 | } | 339 | | | 340 | 9 | uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8); | 341 | | | 342 | 9 | _buffer->reserve(_buffer->size() + packing_len); | 343 | 9 | size_t origin_size = _buffer->size(); | 344 | 9 | _buffer->resize(origin_size + packing_len); | 345 | 9 | bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size); | 346 | 9 | } | 347 | 9 | uint8_t storage_format = 0; | 348 | 9 | if (is_keep_original_value) { | 349 | 0 | storage_format = 2; | 350 | 9 | } else if (is_ascending) { | 351 | 9 | storage_format = 1; | 352 | 9 | } | 353 | 9 | _storage_formats.push_back(storage_format); | 354 | 9 | _bit_widths.push_back(bit_width); | 355 | | | 356 | 9 | _buffered_values_num = 0; | 357 | 9 | } |
_ZN5doris10ForEncoderIlE27bit_packing_one_frame_valueEPKl Line | Count | Source | 270 | 24.4k | void ForEncoder<T>::bit_packing_one_frame_value(const T* input) { | 271 | 24.4k | T min = input[0]; | 272 | 24.4k | T max = input[0]; | 273 | 24.4k | bool is_ascending = true; | 274 | 24.4k | uint8_t bit_width = 0; | 275 | 24.4k | T half_max_delta = numeric_limits_max() >> 1; | 276 | 24.4k | bool is_keep_original_value = false; | 277 | | | 278 | | // 1. make sure order_flag, save_original_value, and find max&min. | 279 | 2.08M | for (uint8_t i = 1; i < _buffered_values_num; ++i) { | 280 | 2.06M | if (is_ascending) { | 281 | 43.3k | if (input[i] < input[i - 1]) { | 282 | 24.2k | is_ascending = false; | 283 | 24.2k | } else { | 284 | 19.1k | if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow | 285 | 0 | is_keep_original_value = true; | 286 | 19.1k | } else { | 287 | 19.1k | bit_width = std::max(bit_width, bits(input[i] - input[i - 1])); | 288 | 19.1k | } | 289 | 19.1k | } | 290 | 43.3k | } | 291 | | | 292 | 2.06M | if (input[i] < min) { | 293 | 88.0k | min = input[i]; | 294 | 88.0k | continue; | 295 | 88.0k | } | 296 | | | 297 | 1.97M | if (input[i] > max) { | 298 | 89.5k | max = input[i]; | 299 | 89.5k | } | 300 | 1.97M | } | 301 | 24.4k | if (!is_ascending) { | 302 | 24.2k | if ((max >> 1) - (min >> 1) > half_max_delta) { | 303 | 0 | is_keep_original_value = true; | 304 | 0 | } | 305 | 24.2k | } | 306 | | | 307 | | // 2. save min value. | 308 | 24.4k | if (sizeof(T) == 16) { | 309 | 0 | put_fixed128_le(_buffer, static_cast<uint128_t>(min)); | 310 | 24.4k | } else if (sizeof(T) == 8) { | 311 | 24.4k | put_fixed64_le(_buffer, static_cast<uint64_t>(min)); | 312 | 24.4k | } else { | 313 | 0 | put_fixed32_le(_buffer, static_cast<uint32_t>(min)); | 314 | 0 | } | 315 | | | 316 | | // 3.1 save original value. 
| 317 | 24.4k | if (is_keep_original_value) { | 318 | 0 | bit_width = sizeof(T) * 8; | 319 | 0 | uint32_t len = _buffered_values_num * bit_width; | 320 | 0 | _buffer->reserve(_buffer->size() + len); | 321 | 0 | size_t origin_size = _buffer->size(); | 322 | 0 | _buffer->resize(origin_size + len); | 323 | 0 | bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size); | 324 | 24.4k | } else { | 325 | | // 3.2 bit pack. | 326 | | // improve for ascending order input, we could use fewer bit | 327 | 24.4k | T delta_values[FRAME_VALUE_NUM]; | 328 | 24.4k | if (is_ascending) { | 329 | 220 | delta_values[0] = 0; | 330 | 1.29k | for (uint8_t i = 1; i < _buffered_values_num; ++i) { | 331 | 1.07k | delta_values[i] = input[i] - input[i - 1]; | 332 | 1.07k | } | 333 | 24.2k | } else { | 334 | 24.2k | bit_width = bits(static_cast<T>(max - min)); | 335 | 2.11M | for (uint8_t i = 0; i < _buffered_values_num; ++i) { | 336 | 2.08M | delta_values[i] = input[i] - min; | 337 | 2.08M | } | 338 | 24.2k | } | 339 | | | 340 | 24.4k | uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8); | 341 | | | 342 | 24.4k | _buffer->reserve(_buffer->size() + packing_len); | 343 | 24.4k | size_t origin_size = _buffer->size(); | 344 | 24.4k | _buffer->resize(origin_size + packing_len); | 345 | 24.4k | bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size); | 346 | 24.4k | } | 347 | 24.4k | uint8_t storage_format = 0; | 348 | 24.4k | if (is_keep_original_value) { | 349 | 0 | storage_format = 2; | 350 | 24.4k | } else if (is_ascending) { | 351 | 220 | storage_format = 1; | 352 | 220 | } | 353 | 24.4k | _storage_formats.push_back(storage_format); | 354 | 24.4k | _bit_widths.push_back(bit_width); | 355 | | | 356 | 24.4k | _buffered_values_num = 0; | 357 | 24.4k | } |
_ZN5doris10ForEncoderInE27bit_packing_one_frame_valueEPKn Line | Count | Source | 270 | 24.4k | void ForEncoder<T>::bit_packing_one_frame_value(const T* input) { | 271 | 24.4k | T min = input[0]; | 272 | 24.4k | T max = input[0]; | 273 | 24.4k | bool is_ascending = true; | 274 | 24.4k | uint8_t bit_width = 0; | 275 | 24.4k | T half_max_delta = numeric_limits_max() >> 1; | 276 | 24.4k | bool is_keep_original_value = false; | 277 | | | 278 | | // 1. make sure order_flag, save_original_value, and find max&min. | 279 | 2.08M | for (uint8_t i = 1; i < _buffered_values_num; ++i) { | 280 | 2.06M | if (is_ascending) { | 281 | 41.5k | if (input[i] < input[i - 1]) { | 282 | 24.2k | is_ascending = false; | 283 | 24.2k | } else { | 284 | 17.3k | if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow | 285 | 0 | is_keep_original_value = true; | 286 | 17.3k | } else { | 287 | 17.3k | bit_width = std::max(bit_width, bits(input[i] - input[i - 1])); | 288 | 17.3k | } | 289 | 17.3k | } | 290 | 41.5k | } | 291 | | | 292 | 2.06M | if (input[i] < min) { | 293 | 92.7k | min = input[i]; | 294 | 92.7k | continue; | 295 | 92.7k | } | 296 | | | 297 | 1.97M | if (input[i] > max) { | 298 | 92.5k | max = input[i]; | 299 | 92.5k | } | 300 | 1.97M | } | 301 | 24.4k | if (!is_ascending) { | 302 | 24.2k | if ((max >> 1) - (min >> 1) > half_max_delta) { | 303 | 0 | is_keep_original_value = true; | 304 | 0 | } | 305 | 24.2k | } | 306 | | | 307 | | // 2. save min value. | 308 | 24.4k | if (sizeof(T) == 16) { | 309 | 24.4k | put_fixed128_le(_buffer, static_cast<uint128_t>(min)); | 310 | 24.4k | } else if (sizeof(T) == 8) { | 311 | 0 | put_fixed64_le(_buffer, static_cast<uint64_t>(min)); | 312 | 0 | } else { | 313 | 0 | put_fixed32_le(_buffer, static_cast<uint32_t>(min)); | 314 | 0 | } | 315 | | | 316 | | // 3.1 save original value. 
| 317 | 24.4k | if (is_keep_original_value) { | 318 | 0 | bit_width = sizeof(T) * 8; | 319 | 0 | uint32_t len = _buffered_values_num * bit_width; | 320 | 0 | _buffer->reserve(_buffer->size() + len); | 321 | 0 | size_t origin_size = _buffer->size(); | 322 | 0 | _buffer->resize(origin_size + len); | 323 | 0 | bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size); | 324 | 24.4k | } else { | 325 | | // 3.2 bit pack. | 326 | | // improve for ascending order input, we could use fewer bit | 327 | 24.4k | T delta_values[FRAME_VALUE_NUM]; | 328 | 24.4k | if (is_ascending) { | 329 | 214 | delta_values[0] = 0; | 330 | 338 | for (uint8_t i = 1; i < _buffered_values_num; ++i) { | 331 | 124 | delta_values[i] = input[i] - input[i - 1]; | 332 | 124 | } | 333 | 24.2k | } else { | 334 | 24.2k | bit_width = bits(static_cast<T>(max - min)); | 335 | 2.11M | for (uint8_t i = 0; i < _buffered_values_num; ++i) { | 336 | 2.08M | delta_values[i] = input[i] - min; | 337 | 2.08M | } | 338 | 24.2k | } | 339 | | | 340 | 24.4k | uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8); | 341 | | | 342 | 24.4k | _buffer->reserve(_buffer->size() + packing_len); | 343 | 24.4k | size_t origin_size = _buffer->size(); | 344 | 24.4k | _buffer->resize(origin_size + packing_len); | 345 | 24.4k | bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size); | 346 | 24.4k | } | 347 | 24.4k | uint8_t storage_format = 0; | 348 | 24.4k | if (is_keep_original_value) { | 349 | 0 | storage_format = 2; | 350 | 24.4k | } else if (is_ascending) { | 351 | 214 | storage_format = 1; | 352 | 214 | } | 353 | 24.4k | _storage_formats.push_back(storage_format); | 354 | 24.4k | _bit_widths.push_back(bit_width); | 355 | | | 356 | 24.4k | _buffered_values_num = 0; | 357 | 24.4k | } |
Unexecuted instantiation: _ZN5doris10ForEncoderIhE27bit_packing_one_frame_valueEPKh Unexecuted instantiation: _ZN5doris10ForEncoderItE27bit_packing_one_frame_valueEPKt _ZN5doris10ForEncoderIjE27bit_packing_one_frame_valueEPKj Line | Count | Source | 270 | 6 | void ForEncoder<T>::bit_packing_one_frame_value(const T* input) { | 271 | 6 | T min = input[0]; | 272 | 6 | T max = input[0]; | 273 | 6 | bool is_ascending = true; | 274 | 6 | uint8_t bit_width = 0; | 275 | 6 | T half_max_delta = numeric_limits_max() >> 1; | 276 | 6 | bool is_keep_original_value = false; | 277 | | | 278 | | // 1. make sure order_flag, save_original_value, and find max&min. | 279 | 768 | for (uint8_t i = 1; i < _buffered_values_num; ++i) { | 280 | 762 | if (is_ascending) { | 281 | 762 | if (input[i] < input[i - 1]) { | 282 | 0 | is_ascending = false; | 283 | 762 | } else { | 284 | 762 | if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow | 285 | 0 | is_keep_original_value = true; | 286 | 762 | } else { | 287 | 762 | bit_width = std::max(bit_width, bits(input[i] - input[i - 1])); | 288 | 762 | } | 289 | 762 | } | 290 | 762 | } | 291 | | | 292 | 762 | if (input[i] < min) { | 293 | 0 | min = input[i]; | 294 | 0 | continue; | 295 | 0 | } | 296 | | | 297 | 762 | if (input[i] > max) { | 298 | 762 | max = input[i]; | 299 | 762 | } | 300 | 762 | } | 301 | 6 | if (!is_ascending) { | 302 | 0 | if ((max >> 1) - (min >> 1) > half_max_delta) { | 303 | 0 | is_keep_original_value = true; | 304 | 0 | } | 305 | 0 | } | 306 | | | 307 | | // 2. save min value. | 308 | 6 | if (sizeof(T) == 16) { | 309 | 0 | put_fixed128_le(_buffer, static_cast<uint128_t>(min)); | 310 | 6 | } else if (sizeof(T) == 8) { | 311 | 0 | put_fixed64_le(_buffer, static_cast<uint64_t>(min)); | 312 | 6 | } else { | 313 | 6 | put_fixed32_le(_buffer, static_cast<uint32_t>(min)); | 314 | 6 | } | 315 | | | 316 | | // 3.1 save original value. 
| 317 | 6 | if (is_keep_original_value) { | 318 | 0 | bit_width = sizeof(T) * 8; | 319 | 0 | uint32_t len = _buffered_values_num * bit_width; | 320 | 0 | _buffer->reserve(_buffer->size() + len); | 321 | 0 | size_t origin_size = _buffer->size(); | 322 | 0 | _buffer->resize(origin_size + len); | 323 | 0 | bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size); | 324 | 6 | } else { | 325 | | // 3.2 bit pack. | 326 | | // improve for ascending order input, we could use fewer bit | 327 | 6 | T delta_values[FRAME_VALUE_NUM]; | 328 | 6 | if (is_ascending) { | 329 | 6 | delta_values[0] = 0; | 330 | 768 | for (uint8_t i = 1; i < _buffered_values_num; ++i) { | 331 | 762 | delta_values[i] = input[i] - input[i - 1]; | 332 | 762 | } | 333 | 6 | } else { | 334 | 0 | bit_width = bits(static_cast<T>(max - min)); | 335 | 0 | for (uint8_t i = 0; i < _buffered_values_num; ++i) { | 336 | 0 | delta_values[i] = input[i] - min; | 337 | 0 | } | 338 | 0 | } | 339 | | | 340 | 6 | uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8); | 341 | | | 342 | 6 | _buffer->reserve(_buffer->size() + packing_len); | 343 | 6 | size_t origin_size = _buffer->size(); | 344 | 6 | _buffer->resize(origin_size + packing_len); | 345 | 6 | bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size); | 346 | 6 | } | 347 | 6 | uint8_t storage_format = 0; | 348 | 6 | if (is_keep_original_value) { | 349 | 0 | storage_format = 2; | 350 | 6 | } else if (is_ascending) { | 351 | 6 | storage_format = 1; | 352 | 6 | } | 353 | 6 | _storage_formats.push_back(storage_format); | 354 | 6 | _bit_widths.push_back(bit_width); | 355 | | | 356 | 6 | _buffered_values_num = 0; | 357 | 6 | } |
Unexecuted instantiation: _ZN5doris10ForEncoderImE27bit_packing_one_frame_valueEPKm Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE27bit_packing_one_frame_valueEPKS1_ Unexecuted instantiation: _ZN5doris10ForEncoderIoE27bit_packing_one_frame_valueEPKo |
358 | | |
359 | | template <typename T> |
360 | 32.6k | uint32_t ForEncoder<T>::flush() { |
361 | 32.6k | if (_buffered_values_num != 0) { |
362 | 32.5k | bit_packing_one_frame_value(_buffered_values); |
363 | 32.5k | } |
364 | | |
365 | | // write the footer: |
366 | | // 1 _storage_formats and bit_widths |
367 | 32.6k | DCHECK(_storage_formats.size() == _bit_widths.size()) |
368 | 0 | << "Size of _storage_formats and _bit_widths should be equal."; |
369 | 81.5k | for (size_t i = 0; i < _storage_formats.size(); i++) { |
370 | 48.9k | _buffer->append(&_storage_formats[i], 1); |
371 | 48.9k | _buffer->append(&_bit_widths[i], 1); |
372 | 48.9k | } |
373 | | // 2 frame_value_num and values_num |
374 | 32.6k | uint8_t frame_value_num = FRAME_VALUE_NUM; |
375 | 32.6k | _buffer->append(&frame_value_num, 1); |
376 | 32.6k | put_fixed32_le(_buffer, _values_num); |
377 | | |
378 | 32.6k | return cast_set<uint32_t>(_buffer->size()); |
379 | 32.6k | } Unexecuted instantiation: _ZN5doris10ForEncoderIaE5flushEv Unexecuted instantiation: _ZN5doris10ForEncoderIsE5flushEv _ZN5doris10ForEncoderIiE5flushEv Line | Count | Source | 360 | 7 | uint32_t ForEncoder<T>::flush() { | 361 | 7 | if (_buffered_values_num != 0) { | 362 | 4 | bit_packing_one_frame_value(_buffered_values); | 363 | 4 | } | 364 | | | 365 | | // write the footer: | 366 | | // 1 _storage_formats and bit_widths | 367 | 7 | DCHECK(_storage_formats.size() == _bit_widths.size()) | 368 | 0 | << "Size of _storage_formats and _bit_widths should be equal."; | 369 | 16 | for (size_t i = 0; i < _storage_formats.size(); i++) { | 370 | 9 | _buffer->append(&_storage_formats[i], 1); | 371 | 9 | _buffer->append(&_bit_widths[i], 1); | 372 | 9 | } | 373 | | // 2 frame_value_num and values_num | 374 | 7 | uint8_t frame_value_num = FRAME_VALUE_NUM; | 375 | 7 | _buffer->append(&frame_value_num, 1); | 376 | 7 | put_fixed32_le(_buffer, _values_num); | 377 | | | 378 | 7 | return cast_set<uint32_t>(_buffer->size()); | 379 | 7 | } |
_ZN5doris10ForEncoderIlE5flushEv Line | Count | Source | 360 | 16.3k | uint32_t ForEncoder<T>::flush() { | 361 | 16.3k | if (_buffered_values_num != 0) { | 362 | 16.2k | bit_packing_one_frame_value(_buffered_values); | 363 | 16.2k | } | 364 | | | 365 | | // write the footer: | 366 | | // 1 _storage_formats and bit_widths | 367 | 16.3k | DCHECK(_storage_formats.size() == _bit_widths.size()) | 368 | 0 | << "Size of _storage_formats and _bit_widths should be equal."; | 369 | 40.7k | for (size_t i = 0; i < _storage_formats.size(); i++) { | 370 | 24.4k | _buffer->append(&_storage_formats[i], 1); | 371 | 24.4k | _buffer->append(&_bit_widths[i], 1); | 372 | 24.4k | } | 373 | | // 2 frame_value_num and values_num | 374 | 16.3k | uint8_t frame_value_num = FRAME_VALUE_NUM; | 375 | 16.3k | _buffer->append(&frame_value_num, 1); | 376 | 16.3k | put_fixed32_le(_buffer, _values_num); | 377 | | | 378 | 16.3k | return cast_set<uint32_t>(_buffer->size()); | 379 | 16.3k | } |
_ZN5doris10ForEncoderInE5flushEv Line | Count | Source | 360 | 16.3k | uint32_t ForEncoder<T>::flush() { | 361 | 16.3k | if (_buffered_values_num != 0) { | 362 | 16.2k | bit_packing_one_frame_value(_buffered_values); | 363 | 16.2k | } | 364 | | | 365 | | // write the footer: | 366 | | // 1 _storage_formats and bit_widths | 367 | 16.3k | DCHECK(_storage_formats.size() == _bit_widths.size()) | 368 | 0 | << "Size of _storage_formats and _bit_widths should be equal."; | 369 | 40.7k | for (size_t i = 0; i < _storage_formats.size(); i++) { | 370 | 24.4k | _buffer->append(&_storage_formats[i], 1); | 371 | 24.4k | _buffer->append(&_bit_widths[i], 1); | 372 | 24.4k | } | 373 | | // 2 frame_value_num and values_num | 374 | 16.3k | uint8_t frame_value_num = FRAME_VALUE_NUM; | 375 | 16.3k | _buffer->append(&frame_value_num, 1); | 376 | 16.3k | put_fixed32_le(_buffer, _values_num); | 377 | | | 378 | 16.3k | return cast_set<uint32_t>(_buffer->size()); | 379 | 16.3k | } |
Unexecuted instantiation: _ZN5doris10ForEncoderIhE5flushEv Unexecuted instantiation: _ZN5doris10ForEncoderItE5flushEv _ZN5doris10ForEncoderIjE5flushEv Line | Count | Source | 360 | 3 | uint32_t ForEncoder<T>::flush() { | 361 | 3 | if (_buffered_values_num != 0) { | 362 | 0 | bit_packing_one_frame_value(_buffered_values); | 363 | 0 | } | 364 | | | 365 | | // write the footer: | 366 | | // 1 _storage_formats and bit_widths | 367 | 3 | DCHECK(_storage_formats.size() == _bit_widths.size()) | 368 | 0 | << "Size of _storage_formats and _bit_widths should be equal."; | 369 | 9 | for (size_t i = 0; i < _storage_formats.size(); i++) { | 370 | 6 | _buffer->append(&_storage_formats[i], 1); | 371 | 6 | _buffer->append(&_bit_widths[i], 1); | 372 | 6 | } | 373 | | // 2 frame_value_num and values_num | 374 | 3 | uint8_t frame_value_num = FRAME_VALUE_NUM; | 375 | 3 | _buffer->append(&frame_value_num, 1); | 376 | 3 | put_fixed32_le(_buffer, _values_num); | 377 | | | 378 | 3 | return cast_set<uint32_t>(_buffer->size()); | 379 | 3 | } |
Unexecuted instantiation: _ZN5doris10ForEncoderImE5flushEv Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE5flushEv Unexecuted instantiation: _ZN5doris10ForEncoderIoE5flushEv |
380 | | |
381 | | template <typename T> |
382 | 48.9k | const T ForEncoder<T>::numeric_limits_max() { |
383 | 48.9k | return std::numeric_limits<T>::max(); |
384 | 48.9k | } Unexecuted instantiation: _ZN5doris10ForEncoderIaE18numeric_limits_maxEv Unexecuted instantiation: _ZN5doris10ForEncoderIsE18numeric_limits_maxEv _ZN5doris10ForEncoderIiE18numeric_limits_maxEv Line | Count | Source | 382 | 9 | const T ForEncoder<T>::numeric_limits_max() { | 383 | 9 | return std::numeric_limits<T>::max(); | 384 | 9 | } |
_ZN5doris10ForEncoderIlE18numeric_limits_maxEv Line | Count | Source | 382 | 24.4k | const T ForEncoder<T>::numeric_limits_max() { | 383 | 24.4k | return std::numeric_limits<T>::max(); | 384 | 24.4k | } |
_ZN5doris10ForEncoderInE18numeric_limits_maxEv Line | Count | Source | 382 | 24.4k | const T ForEncoder<T>::numeric_limits_max() { | 383 | 24.4k | return std::numeric_limits<T>::max(); | 384 | 24.4k | } |
Unexecuted instantiation: _ZN5doris10ForEncoderIhE18numeric_limits_maxEv Unexecuted instantiation: _ZN5doris10ForEncoderItE18numeric_limits_maxEv _ZN5doris10ForEncoderIjE18numeric_limits_maxEv Line | Count | Source | 382 | 6 | const T ForEncoder<T>::numeric_limits_max() { | 383 | 6 | return std::numeric_limits<T>::max(); | 384 | 6 | } |
Unexecuted instantiation: _ZN5doris10ForEncoderImE18numeric_limits_maxEv Unexecuted instantiation: _ZN5doris10ForEncoderIoE18numeric_limits_maxEv |
385 | | |
386 | | template <> |
387 | 0 | const uint24_t ForEncoder<uint24_t>::numeric_limits_max() { |
388 | 0 | return 0XFFFFFF; |
389 | 0 | } |
390 | | |
391 | | template <typename T> |
392 | 32.6k | bool ForDecoder<T>::init() { |
393 | | // When row count is zero, the minimum footer size is 5: |
394 | | // only has ValuesNum(4) + FrameValueNum(1) |
395 | 32.6k | if (_buffer_len < 5) { |
396 | 0 | return false; |
397 | 0 | } |
398 | | |
399 | 32.6k | _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5); |
400 | 32.6k | _values_num = decode_fixed32_le(_buffer + _buffer_len - 4); |
401 | 32.6k | _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0); |
402 | 32.6k | _last_frame_size = |
403 | 32.6k | cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num)); |
404 | | |
405 | 32.6k | size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2; |
406 | | |
407 | | // read _storage_formats, bit_widths and compute frame_offsets |
408 | 32.6k | u_int32_t frame_start_offset = 0; |
409 | 81.5k | for (uint32_t i = 0; i < _frame_count; i++) { |
410 | 48.9k | uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset); |
411 | 48.9k | uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1); |
412 | 48.9k | _bit_widths.push_back(bit_width); |
413 | 48.9k | _storage_formats.push_back(order_flag); |
414 | | |
415 | 48.9k | bit_width_offset += 2; |
416 | | |
417 | 48.9k | _frame_offsets.push_back(frame_start_offset); |
418 | 48.9k | if (sizeof(T) == 16) { |
419 | 24.4k | frame_start_offset += bit_width * _max_frame_size / 8 + 16; |
420 | 24.4k | } else if (sizeof(T) == 8) { |
421 | 24.4k | frame_start_offset += bit_width * _max_frame_size / 8 + 8; |
422 | 24.4k | } else { |
423 | 15 | frame_start_offset += bit_width * _max_frame_size / 8 + 4; |
424 | 15 | } |
425 | 48.9k | } |
426 | | |
427 | 32.6k | _out_buffer.resize(_max_frame_size); |
428 | 32.6k | _parsed = true; |
429 | | |
430 | 32.6k | return true; |
431 | 32.6k | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE4initEv Unexecuted instantiation: _ZN5doris10ForDecoderIsE4initEv _ZN5doris10ForDecoderIiE4initEv Line | Count | Source | 392 | 7 | bool ForDecoder<T>::init() { | 393 | | // When row count is zero, the minimum footer size is 5: | 394 | | // only has ValuesNum(4) + FrameValueNum(1) | 395 | 7 | if (_buffer_len < 5) { | 396 | 0 | return false; | 397 | 0 | } | 398 | | | 399 | 7 | _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5); | 400 | 7 | _values_num = decode_fixed32_le(_buffer + _buffer_len - 4); | 401 | 7 | _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0); | 402 | 7 | _last_frame_size = | 403 | 7 | cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num)); | 404 | | | 405 | 7 | size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2; | 406 | | | 407 | | // read _storage_formats, bit_widths and compute frame_offsets | 408 | 7 | u_int32_t frame_start_offset = 0; | 409 | 16 | for (uint32_t i = 0; i < _frame_count; i++) { | 410 | 9 | uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset); | 411 | 9 | uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1); | 412 | 9 | _bit_widths.push_back(bit_width); | 413 | 9 | _storage_formats.push_back(order_flag); | 414 | | | 415 | 9 | bit_width_offset += 2; | 416 | | | 417 | 9 | _frame_offsets.push_back(frame_start_offset); | 418 | 9 | if (sizeof(T) == 16) { | 419 | 0 | frame_start_offset += bit_width * _max_frame_size / 8 + 16; | 420 | 9 | } else if (sizeof(T) == 8) { | 421 | 0 | frame_start_offset += bit_width * _max_frame_size / 8 + 8; | 422 | 9 | } else { | 423 | 9 | frame_start_offset += bit_width * _max_frame_size / 8 + 4; | 424 | 9 | } | 425 | 9 | } | 426 | | | 427 | 7 | _out_buffer.resize(_max_frame_size); | 428 | 7 | _parsed = true; | 429 | | | 430 | 7 | return true; | 431 | 7 | } |
_ZN5doris10ForDecoderIlE4initEv Line | Count | Source | 392 | 16.3k | bool ForDecoder<T>::init() { | 393 | | // When row count is zero, the minimum footer size is 5: | 394 | | // only has ValuesNum(4) + FrameValueNum(1) | 395 | 16.3k | if (_buffer_len < 5) { | 396 | 0 | return false; | 397 | 0 | } | 398 | | | 399 | 16.3k | _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5); | 400 | 16.3k | _values_num = decode_fixed32_le(_buffer + _buffer_len - 4); | 401 | 16.3k | _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0); | 402 | 16.3k | _last_frame_size = | 403 | 16.3k | cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num)); | 404 | | | 405 | 16.3k | size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2; | 406 | | | 407 | | // read _storage_formats, bit_widths and compute frame_offsets | 408 | 16.3k | u_int32_t frame_start_offset = 0; | 409 | 40.7k | for (uint32_t i = 0; i < _frame_count; i++) { | 410 | 24.4k | uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset); | 411 | 24.4k | uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1); | 412 | 24.4k | _bit_widths.push_back(bit_width); | 413 | 24.4k | _storage_formats.push_back(order_flag); | 414 | | | 415 | 24.4k | bit_width_offset += 2; | 416 | | | 417 | 24.4k | _frame_offsets.push_back(frame_start_offset); | 418 | 24.4k | if (sizeof(T) == 16) { | 419 | 0 | frame_start_offset += bit_width * _max_frame_size / 8 + 16; | 420 | 24.4k | } else if (sizeof(T) == 8) { | 421 | 24.4k | frame_start_offset += bit_width * _max_frame_size / 8 + 8; | 422 | 24.4k | } else { | 423 | 0 | frame_start_offset += bit_width * _max_frame_size / 8 + 4; | 424 | 0 | } | 425 | 24.4k | } | 426 | | | 427 | 16.3k | _out_buffer.resize(_max_frame_size); | 428 | 16.3k | _parsed = true; | 429 | | | 430 | 16.3k | return true; | 431 | 16.3k | } |
_ZN5doris10ForDecoderInE4initEv Line | Count | Source | 392 | 16.3k | bool ForDecoder<T>::init() { | 393 | | // When row count is zero, the minimum footer size is 5: | 394 | | // only has ValuesNum(4) + FrameValueNum(1) | 395 | 16.3k | if (_buffer_len < 5) { | 396 | 0 | return false; | 397 | 0 | } | 398 | | | 399 | 16.3k | _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5); | 400 | 16.3k | _values_num = decode_fixed32_le(_buffer + _buffer_len - 4); | 401 | 16.3k | _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0); | 402 | 16.3k | _last_frame_size = | 403 | 16.3k | cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num)); | 404 | | | 405 | 16.3k | size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2; | 406 | | | 407 | | // read _storage_formats, bit_widths and compute frame_offsets | 408 | 16.3k | u_int32_t frame_start_offset = 0; | 409 | 40.7k | for (uint32_t i = 0; i < _frame_count; i++) { | 410 | 24.4k | uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset); | 411 | 24.4k | uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1); | 412 | 24.4k | _bit_widths.push_back(bit_width); | 413 | 24.4k | _storage_formats.push_back(order_flag); | 414 | | | 415 | 24.4k | bit_width_offset += 2; | 416 | | | 417 | 24.4k | _frame_offsets.push_back(frame_start_offset); | 418 | 24.4k | if (sizeof(T) == 16) { | 419 | 24.4k | frame_start_offset += bit_width * _max_frame_size / 8 + 16; | 420 | 24.4k | } else if (sizeof(T) == 8) { | 421 | 0 | frame_start_offset += bit_width * _max_frame_size / 8 + 8; | 422 | 0 | } else { | 423 | 0 | frame_start_offset += bit_width * _max_frame_size / 8 + 4; | 424 | 0 | } | 425 | 24.4k | } | 426 | | | 427 | 16.3k | _out_buffer.resize(_max_frame_size); | 428 | 16.3k | _parsed = true; | 429 | | | 430 | 16.3k | return true; | 431 | 16.3k | } |
Unexecuted instantiation: _ZN5doris10ForDecoderIhE4initEv Unexecuted instantiation: _ZN5doris10ForDecoderItE4initEv _ZN5doris10ForDecoderIjE4initEv Line | Count | Source | 392 | 3 | bool ForDecoder<T>::init() { | 393 | | // When row count is zero, the minimum footer size is 5: | 394 | | // only has ValuesNum(4) + FrameValueNum(1) | 395 | 3 | if (_buffer_len < 5) { | 396 | 0 | return false; | 397 | 0 | } | 398 | | | 399 | 3 | _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5); | 400 | 3 | _values_num = decode_fixed32_le(_buffer + _buffer_len - 4); | 401 | 3 | _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0); | 402 | 3 | _last_frame_size = | 403 | 3 | cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num)); | 404 | | | 405 | 3 | size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2; | 406 | | | 407 | | // read _storage_formats, bit_widths and compute frame_offsets | 408 | 3 | u_int32_t frame_start_offset = 0; | 409 | 9 | for (uint32_t i = 0; i < _frame_count; i++) { | 410 | 6 | uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset); | 411 | 6 | uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1); | 412 | 6 | _bit_widths.push_back(bit_width); | 413 | 6 | _storage_formats.push_back(order_flag); | 414 | | | 415 | 6 | bit_width_offset += 2; | 416 | | | 417 | 6 | _frame_offsets.push_back(frame_start_offset); | 418 | 6 | if (sizeof(T) == 16) { | 419 | 0 | frame_start_offset += bit_width * _max_frame_size / 8 + 16; | 420 | 6 | } else if (sizeof(T) == 8) { | 421 | 0 | frame_start_offset += bit_width * _max_frame_size / 8 + 8; | 422 | 6 | } else { | 423 | 6 | frame_start_offset += bit_width * _max_frame_size / 8 + 4; | 424 | 6 | } | 425 | 6 | } | 426 | | | 427 | 3 | _out_buffer.resize(_max_frame_size); | 428 | 3 | _parsed = true; | 429 | | | 430 | 3 | return true; | 431 | 3 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderImE4initEv Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE4initEv Unexecuted instantiation: _ZN5doris10ForDecoderIoE4initEv |
432 | | |
433 | | // todo(kks): improve this method by SIMD instructions |
434 | | |
435 | | template <typename T> |
436 | | template <typename U> |
437 | | void ForDecoder<T>::bit_unpack_optimize(const uint8_t* input, uint8_t in_num, int bit_width, |
438 | 81.3k | T* output) { |
439 | 81.3k | static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value, |
440 | 81.3k | "bit_unpack_optimize only supports U = int64_t or __int128_t"); |
441 | 81.3k | constexpr int u_size = sizeof(U); // Size of U |
442 | 81.3k | constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size) |
443 | 81.3k | int valid_bit = 0; // How many valid bits |
444 | 81.3k | int need_bit = 0; // still need |
445 | 81.3k | size_t input_size = (in_num * bit_width + 7) >> 3; // input's size |
446 | 81.3k | int full_batch_size = |
447 | 81.3k | cast_set<int>((input_size >> u_size_shift) |
448 | 81.3k | << u_size_shift); // Adjust input_size to a multiple of u_size |
449 | 81.3k | int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size. |
450 | | // The number of bits in input to adjust to multiples of 8 and thus more |
451 | 81.3k | int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width)); |
452 | | |
453 | | // to ensure that only bit_width bits are valid |
454 | 81.3k | T output_mask; |
455 | 81.3k | if (bit_width >= static_cast<int>(sizeof(T) * 8)) { |
456 | 0 | output_mask = static_cast<T>(~T(0)); |
457 | 81.3k | } else { |
458 | 81.3k | output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1); |
459 | 81.3k | } |
460 | | |
461 | 81.3k | U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking |
462 | | |
463 | 4.48M | for (int i = 0; i < full_batch_size; i += u_size) { |
464 | 4.40M | s = 0; |
465 | | |
466 | 4.40M | s = to_endian<std::endian::big>(*((U*)(input + i))); |
467 | | |
468 | | // Determine what the valid bits are based on u_size |
469 | 4.40M | valid_bit = u_size << 3; |
470 | | |
471 | | // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop. |
472 | 4.40M | if (tail_count == 0 && i == full_batch_size - u_size) { |
473 | 21.7k | valid_bit -= more_bit; |
474 | 21.7k | s >>= more_bit; |
475 | 21.7k | } |
476 | | |
477 | 4.40M | if (need_bit) { |
478 | | // The last time we take away the high bit_width - need_bit, |
479 | | // we need to make up the rest of the need_bit from the width. |
480 | | // Use valid_bit - need_bit to compute high need_bit bits of s |
481 | | // perform an AND operation to ensure that only need_bit bits are valid |
482 | 4.09M | auto mask = (static_cast<U>(1) << need_bit) - 1; |
483 | 4.09M | auto shifted = s >> (valid_bit - need_bit); |
484 | 4.09M | auto masked_result = shifted & mask; |
485 | 4.09M | if constexpr (sizeof(T) <= 4) { |
486 | 0 | *output |= static_cast<T>(static_cast<uint32_t>(masked_result)); |
487 | 4.09M | } else { |
488 | 4.09M | *output |= static_cast<T>(masked_result); |
489 | 4.09M | } |
490 | 4.09M | output++; |
491 | 4.09M | valid_bit -= need_bit; |
492 | 4.09M | } |
493 | | |
494 | 4.40M | int num = valid_bit / bit_width; // How many outputs can be processed at a time |
495 | 4.40M | int remainder = valid_bit - num * bit_width; // How many bits are left to store |
496 | | |
497 | | // Starting with the highest valid bit, take out bit_width bits in sequence |
498 | | // perform an AND operation with output_mask to ensure that only bit_width bits are valid |
499 | | // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end |
500 | | // But since there are still remainder bits that can't be processed, need to add the remainder |
501 | 8.51M | for (int j = 0; j < num; j++) { |
502 | 4.11M | *output = |
503 | 4.11M | static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask); |
504 | 4.11M | output++; |
505 | 4.11M | } |
506 | | |
507 | 4.40M | if (remainder) { |
508 | | // Process the last remaining remainder bit. |
509 | | // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits. |
510 | | // output = y << (bit_width - remainder) Use the high bit_width - remainder bit |
511 | 4.14M | if constexpr (sizeof(T) <= 4) { |
512 | 0 | auto masked_value = static_cast<T>( |
513 | 0 | static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1))); |
514 | 0 | *output = static_cast<T>(masked_value << (bit_width - remainder)); |
515 | 4.14M | } else { |
516 | 4.14M | auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1))); |
517 | 4.14M | *output = static_cast<T>(masked_value << (bit_width - remainder)); |
518 | 4.14M | } |
519 | | // Already have remainder bits, next time need bit_width - remainder bits |
520 | 4.14M | need_bit = bit_width - remainder; |
521 | 4.14M | } else { |
522 | 257k | need_bit = 0; |
523 | 257k | } |
524 | 4.40M | } |
525 | | |
526 | | // remainder |
527 | 81.3k | if (tail_count) { |
528 | | // Put the tail_count numbers in the input into s in order, each number occupies 8 bit |
529 | 477k | for (int i = 0; i < tail_count; i++) { |
530 | 417k | s <<= 8; |
531 | 417k | s |= input[full_batch_size + i]; |
532 | 417k | } |
533 | | |
534 | | // tail * 8 is the number of bits that are left to process |
535 | | // tail * 8 - more_bit is to remove the last more_bit |
536 | 59.2k | valid_bit = (tail_count << 3) - more_bit; |
537 | 59.2k | s >>= more_bit; |
538 | | |
539 | | // same as before |
540 | 59.2k | if (need_bit) { |
541 | 54.0k | if constexpr (sizeof(T) <= 4) { |
542 | 0 | *output |= static_cast<T>(static_cast<uint32_t>( |
543 | 0 | (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1))); |
544 | 54.0k | } else { |
545 | 54.0k | *output |= static_cast<T>((s >> (valid_bit - need_bit)) & |
546 | 54.0k | ((static_cast<U>(1) << need_bit) - 1)); |
547 | 54.0k | } |
548 | 54.0k | output++; |
549 | 54.0k | valid_bit -= need_bit; |
550 | 54.0k | } |
551 | | |
552 | 59.2k | int num = valid_bit / bit_width; // How many outputs can be processed at a time |
553 | | |
554 | | // same as before |
555 | 126k | for (int j = 0; j < num; j++) { |
556 | 67.2k | *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask); |
557 | 67.2k | output++; |
558 | 67.2k | } |
559 | 59.2k | } |
560 | 81.3k | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE19bit_unpack_optimizeIlEEvPKhhiPa Unexecuted instantiation: _ZN5doris10ForDecoderIaE19bit_unpack_optimizeInEEvPKhhiPa Unexecuted instantiation: _ZN5doris10ForDecoderIsE19bit_unpack_optimizeIlEEvPKhhiPs Unexecuted instantiation: _ZN5doris10ForDecoderIsE19bit_unpack_optimizeInEEvPKhhiPs _ZN5doris10ForDecoderIiE19bit_unpack_optimizeIlEEvPKhhiPi Line | Count | Source | 438 | 9 | T* output) { | 439 | 9 | static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value, | 440 | 9 | "bit_unpack_optimize only supports U = int64_t or __int128_t"); | 441 | 9 | constexpr int u_size = sizeof(U); // Size of U | 442 | 9 | constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size) | 443 | 9 | int valid_bit = 0; // How many valid bits | 444 | 9 | int need_bit = 0; // still need | 445 | 9 | size_t input_size = (in_num * bit_width + 7) >> 3; // input's size | 446 | 9 | int full_batch_size = | 447 | 9 | cast_set<int>((input_size >> u_size_shift) | 448 | 9 | << u_size_shift); // Adjust input_size to a multiple of u_size | 449 | 9 | int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size. 
| 450 | | // The number of bits in input to adjust to multiples of 8 and thus more | 451 | 9 | int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width)); | 452 | | | 453 | | // to ensure that only bit_width bits are valid | 454 | 9 | T output_mask; | 455 | 9 | if (bit_width >= static_cast<int>(sizeof(T) * 8)) { | 456 | 0 | output_mask = static_cast<T>(~T(0)); | 457 | 9 | } else { | 458 | 9 | output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1); | 459 | 9 | } | 460 | | | 461 | 9 | U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking | 462 | | | 463 | 21 | for (int i = 0; i < full_batch_size; i += u_size) { | 464 | 12 | s = 0; | 465 | | | 466 | 12 | s = to_endian<std::endian::big>(*((U*)(input + i))); | 467 | | | 468 | | // Determine what the valid bits are based on u_size | 469 | 12 | valid_bit = u_size << 3; | 470 | | | 471 | | // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop. | 472 | 12 | if (tail_count == 0 && i == full_batch_size - u_size) { | 473 | 7 | valid_bit -= more_bit; | 474 | 7 | s >>= more_bit; | 475 | 7 | } | 476 | | | 477 | 12 | if (need_bit) { | 478 | | // The last time we take away the high bit_width - need_bit, | 479 | | // we need to make up the rest of the need_bit from the width. 
| 480 | | // Use valid_bit - need_bit to compute high need_bit bits of s | 481 | | // perform an AND operation to ensure that only need_bit bits are valid | 482 | 0 | auto mask = (static_cast<U>(1) << need_bit) - 1; | 483 | 0 | auto shifted = s >> (valid_bit - need_bit); | 484 | 0 | auto masked_result = shifted & mask; | 485 | 0 | if constexpr (sizeof(T) <= 4) { | 486 | 0 | *output |= static_cast<T>(static_cast<uint32_t>(masked_result)); | 487 | | } else { | 488 | | *output |= static_cast<T>(masked_result); | 489 | | } | 490 | 0 | output++; | 491 | 0 | valid_bit -= need_bit; | 492 | 0 | } | 493 | | | 494 | 12 | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 495 | 12 | int remainder = valid_bit - num * bit_width; // How many bits are left to store | 496 | | | 497 | | // Starting with the highest valid bit, take out bit_width bits in sequence | 498 | | // perform an AND operation with output_mask to ensure that only bit_width bits are valid | 499 | | // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end | 500 | | // But since there are still remainder bits that can't be processed, need to add the remainder | 501 | 780 | for (int j = 0; j < num; j++) { | 502 | 768 | *output = | 503 | 768 | static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask); | 504 | 768 | output++; | 505 | 768 | } | 506 | | | 507 | 12 | if (remainder) { | 508 | | // Process the last remaining remainder bit. | 509 | | // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits. 
| 510 | | // output = y << (bit_width - remainder) Use the high bit_width - remainder bit | 511 | 0 | if constexpr (sizeof(T) <= 4) { | 512 | 0 | auto masked_value = static_cast<T>( | 513 | 0 | static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1))); | 514 | 0 | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 515 | | } else { | 516 | | auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1))); | 517 | | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 518 | | } | 519 | | // Already have remainder bits, next time need bit_width - remainder bits | 520 | 0 | need_bit = bit_width - remainder; | 521 | 12 | } else { | 522 | 12 | need_bit = 0; | 523 | 12 | } | 524 | 12 | } | 525 | | | 526 | | // remainder | 527 | 9 | if (tail_count) { | 528 | | // Put the tail_count numbers in the input into s in order, each number occupies 8 bit | 529 | 2 | for (int i = 0; i < tail_count; i++) { | 530 | 1 | s <<= 8; | 531 | 1 | s |= input[full_batch_size + i]; | 532 | 1 | } | 533 | | | 534 | | // tail * 8 is the number of bits that are left to process | 535 | | // tail * 8 - more_bit is to remove the last more_bit | 536 | 1 | valid_bit = (tail_count << 3) - more_bit; | 537 | 1 | s >>= more_bit; | 538 | | | 539 | | // same as before | 540 | 1 | if (need_bit) { | 541 | 0 | if constexpr (sizeof(T) <= 4) { | 542 | 0 | *output |= static_cast<T>(static_cast<uint32_t>( | 543 | 0 | (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1))); | 544 | | } else { | 545 | | *output |= static_cast<T>((s >> (valid_bit - need_bit)) & | 546 | | ((static_cast<U>(1) << need_bit) - 1)); | 547 | | } | 548 | 0 | output++; | 549 | 0 | valid_bit -= need_bit; | 550 | 0 | } | 551 | | | 552 | 1 | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 553 | | | 554 | | // same as before | 555 | 3 | for (int j = 0; j < num; j++) { | 556 | 2 | *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) 
& output_mask); | 557 | 2 | output++; | 558 | 2 | } | 559 | 1 | } | 560 | 9 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderIiE19bit_unpack_optimizeInEEvPKhhiPi _ZN5doris10ForDecoderIlE19bit_unpack_optimizeIlEEvPKhhiPl Line | Count | Source | 438 | 12.3k | T* output) { | 439 | 12.3k | static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value, | 440 | 12.3k | "bit_unpack_optimize only supports U = int64_t or __int128_t"); | 441 | 12.3k | constexpr int u_size = sizeof(U); // Size of U | 442 | 12.3k | constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size) | 443 | 12.3k | int valid_bit = 0; // How many valid bits | 444 | 12.3k | int need_bit = 0; // still need | 445 | 12.3k | size_t input_size = (in_num * bit_width + 7) >> 3; // input's size | 446 | 12.3k | int full_batch_size = | 447 | 12.3k | cast_set<int>((input_size >> u_size_shift) | 448 | 12.3k | << u_size_shift); // Adjust input_size to a multiple of u_size | 449 | 12.3k | int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size. | 450 | | // The number of bits in input to adjust to multiples of 8 and thus more | 451 | 12.3k | int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width)); | 452 | | | 453 | | // to ensure that only bit_width bits are valid | 454 | 12.3k | T output_mask; | 455 | 12.3k | if (bit_width >= static_cast<int>(sizeof(T) * 8)) { | 456 | 0 | output_mask = static_cast<T>(~T(0)); | 457 | 12.3k | } else { | 458 | 12.3k | output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1); | 459 | 12.3k | } | 460 | | | 461 | 12.3k | U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking | 462 | | | 463 | 278k | for (int i = 0; i < full_batch_size; i += u_size) { | 464 | 266k | s = 0; | 465 | | | 466 | 266k | s = to_endian<std::endian::big>(*((U*)(input + i))); | 467 | | | 468 | | // Determine what the valid bits are based on u_size | 469 | 266k | valid_bit = u_size << 3; | 470 | | | 471 | | // If input_size is exactly a multiple of 8, then need 
to remove the last more_bit in the last loop. | 472 | 266k | if (tail_count == 0 && i == full_batch_size - u_size) { | 473 | 5.19k | valid_bit -= more_bit; | 474 | 5.19k | s >>= more_bit; | 475 | 5.19k | } | 476 | | | 477 | 266k | if (need_bit) { | 478 | | // The last time we take away the high bit_width - need_bit, | 479 | | // we need to make up the rest of the need_bit from the width. | 480 | | // Use valid_bit - need_bit to compute high need_bit bits of s | 481 | | // perform an AND operation to ensure that only need_bit bits are valid | 482 | 207k | auto mask = (static_cast<U>(1) << need_bit) - 1; | 483 | 207k | auto shifted = s >> (valid_bit - need_bit); | 484 | 207k | auto masked_result = shifted & mask; | 485 | | if constexpr (sizeof(T) <= 4) { | 486 | | *output |= static_cast<T>(static_cast<uint32_t>(masked_result)); | 487 | 207k | } else { | 488 | 207k | *output |= static_cast<T>(masked_result); | 489 | 207k | } | 490 | 207k | output++; | 491 | 207k | valid_bit -= need_bit; | 492 | 207k | } | 493 | | | 494 | 266k | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 495 | 266k | int remainder = valid_bit - num * bit_width; // How many bits are left to store | 496 | | | 497 | | // Starting with the highest valid bit, take out bit_width bits in sequence | 498 | | // perform an AND operation with output_mask to ensure that only bit_width bits are valid | 499 | | // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end | 500 | | // But since there are still remainder bits that can't be processed, need to add the remainder | 501 | 1.07M | for (int j = 0; j < num; j++) { | 502 | 809k | *output = | 503 | 809k | static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask); | 504 | 809k | output++; | 505 | 809k | } | 506 | | | 507 | 266k | if (remainder) { | 508 | | // Process the last remaining remainder bit. 
| 509 | | // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits. | 510 | | // output = y << (bit_width - remainder) Use the high bit_width - remainder bit | 511 | | if constexpr (sizeof(T) <= 4) { | 512 | | auto masked_value = static_cast<T>( | 513 | | static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1))); | 514 | | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 515 | 212k | } else { | 516 | 212k | auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1))); | 517 | 212k | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 518 | 212k | } | 519 | | // Already have remainder bits, next time need bit_width - remainder bits | 520 | 212k | need_bit = bit_width - remainder; | 521 | 212k | } else { | 522 | 53.9k | need_bit = 0; | 523 | 53.9k | } | 524 | 266k | } | 525 | | | 526 | | // remainder | 527 | 12.3k | if (tail_count) { | 528 | | // Put the tail_count numbers in the input into s in order, each number occupies 8 bit | 529 | 35.0k | for (int i = 0; i < tail_count; i++) { | 530 | 28.0k | s <<= 8; | 531 | 28.0k | s |= input[full_batch_size + i]; | 532 | 28.0k | } | 533 | | | 534 | | // tail * 8 is the number of bits that are left to process | 535 | | // tail * 8 - more_bit is to remove the last more_bit | 536 | 6.98k | valid_bit = (tail_count << 3) - more_bit; | 537 | 6.98k | s >>= more_bit; | 538 | | | 539 | | // same as before | 540 | 6.98k | if (need_bit) { | 541 | | if constexpr (sizeof(T) <= 4) { | 542 | | *output |= static_cast<T>(static_cast<uint32_t>( | 543 | | (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1))); | 544 | 5.14k | } else { | 545 | 5.14k | *output |= static_cast<T>((s >> (valid_bit - need_bit)) & | 546 | 5.14k | ((static_cast<U>(1) << need_bit) - 1)); | 547 | 5.14k | } | 548 | 5.14k | output++; | 549 | 5.14k | valid_bit -= need_bit; | 550 | 5.14k | } | 551 | | | 552 | 6.98k | int num = valid_bit / bit_width; // How many 
outputs can be processed at a time | 553 | | | 554 | | // same as before | 555 | 30.6k | for (int j = 0; j < num; j++) { | 556 | 23.6k | *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask); | 557 | 23.6k | output++; | 558 | 23.6k | } | 559 | 6.98k | } | 560 | 12.3k | } |
_ZN5doris10ForDecoderIlE19bit_unpack_optimizeInEEvPKhhiPl Line | Count | Source | 438 | 12.1k | T* output) { | 439 | 12.1k | static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value, | 440 | 12.1k | "bit_unpack_optimize only supports U = int64_t or __int128_t"); | 441 | 12.1k | constexpr int u_size = sizeof(U); // Size of U | 442 | 12.1k | constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size) | 443 | 12.1k | int valid_bit = 0; // How many valid bits | 444 | 12.1k | int need_bit = 0; // still need | 445 | 12.1k | size_t input_size = (in_num * bit_width + 7) >> 3; // input's size | 446 | 12.1k | int full_batch_size = | 447 | 12.1k | cast_set<int>((input_size >> u_size_shift) | 448 | 12.1k | << u_size_shift); // Adjust input_size to a multiple of u_size | 449 | 12.1k | int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size. | 450 | | // The number of bits in input to adjust to multiples of 8 and thus more | 451 | 12.1k | int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width)); | 452 | | | 453 | | // to ensure that only bit_width bits are valid | 454 | 12.1k | T output_mask; | 455 | 12.1k | if (bit_width >= static_cast<int>(sizeof(T) * 8)) { | 456 | 0 | output_mask = static_cast<T>(~T(0)); | 457 | 12.1k | } else { | 458 | 12.1k | output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1); | 459 | 12.1k | } | 460 | | | 461 | 12.1k | U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking | 462 | | | 463 | 403k | for (int i = 0; i < full_batch_size; i += u_size) { | 464 | 391k | s = 0; | 465 | | | 466 | 391k | s = to_endian<std::endian::big>(*((U*)(input + i))); | 467 | | | 468 | | // Determine what the valid bits are based on u_size | 469 | 391k | valid_bit = u_size << 3; | 470 | | | 471 | | // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop. 
| 472 | 391k | if (tail_count == 0 && i == full_batch_size - u_size) { | 473 | 4.55k | valid_bit -= more_bit; | 474 | 4.55k | s >>= more_bit; | 475 | 4.55k | } | 476 | | | 477 | 391k | if (need_bit) { | 478 | | // The last time we take away the high bit_width - need_bit, | 479 | | // we need to make up the rest of the need_bit from the width. | 480 | | // Use valid_bit - need_bit to compute high need_bit bits of s | 481 | | // perform an AND operation to ensure that only need_bit bits are valid | 482 | 367k | auto mask = (static_cast<U>(1) << need_bit) - 1; | 483 | 367k | auto shifted = s >> (valid_bit - need_bit); | 484 | 367k | auto masked_result = shifted & mask; | 485 | | if constexpr (sizeof(T) <= 4) { | 486 | | *output |= static_cast<T>(static_cast<uint32_t>(masked_result)); | 487 | 367k | } else { | 488 | 367k | *output |= static_cast<T>(masked_result); | 489 | 367k | } | 490 | 367k | output++; | 491 | 367k | valid_bit -= need_bit; | 492 | 367k | } | 493 | | | 494 | 391k | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 495 | 391k | int remainder = valid_bit - num * bit_width; // How many bits are left to store | 496 | | | 497 | | // Starting with the highest valid bit, take out bit_width bits in sequence | 498 | | // perform an AND operation with output_mask to ensure that only bit_width bits are valid | 499 | | // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end | 500 | | // But since there are still remainder bits that can't be processed, need to add the remainder | 501 | 1.05M | for (int j = 0; j < num; j++) { | 502 | 663k | *output = | 503 | 663k | static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask); | 504 | 663k | output++; | 505 | 663k | } | 506 | | | 507 | 391k | if (remainder) { | 508 | | // Process the last remaining remainder bit. | 509 | | // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits. 
| 510 | | // output = y << (bit_width - remainder) Use the high bit_width - remainder bit | 511 | | if constexpr (sizeof(T) <= 4) { | 512 | | auto masked_value = static_cast<T>( | 513 | | static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1))); | 514 | | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 515 | 374k | } else { | 516 | 374k | auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1))); | 517 | 374k | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 518 | 374k | } | 519 | | // Already have remainder bits, next time need bit_width - remainder bits | 520 | 374k | need_bit = bit_width - remainder; | 521 | 374k | } else { | 522 | 16.9k | need_bit = 0; | 523 | 16.9k | } | 524 | 391k | } | 525 | | | 526 | | // remainder | 527 | 12.1k | if (tail_count) { | 528 | | // Put the tail_count numbers in the input into s in order, each number occupies 8 bit | 529 | 68.5k | for (int i = 0; i < tail_count; i++) { | 530 | 60.9k | s <<= 8; | 531 | 60.9k | s |= input[full_batch_size + i]; | 532 | 60.9k | } | 533 | | | 534 | | // tail * 8 is the number of bits that are left to process | 535 | | // tail * 8 - more_bit is to remove the last more_bit | 536 | 7.60k | valid_bit = (tail_count << 3) - more_bit; | 537 | 7.60k | s >>= more_bit; | 538 | | | 539 | | // same as before | 540 | 7.60k | if (need_bit) { | 541 | | if constexpr (sizeof(T) <= 4) { | 542 | | *output |= static_cast<T>(static_cast<uint32_t>( | 543 | | (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1))); | 544 | 7.31k | } else { | 545 | 7.31k | *output |= static_cast<T>((s >> (valid_bit - need_bit)) & | 546 | 7.31k | ((static_cast<U>(1) << need_bit) - 1)); | 547 | 7.31k | } | 548 | 7.31k | output++; | 549 | 7.31k | valid_bit -= need_bit; | 550 | 7.31k | } | 551 | | | 552 | 7.60k | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 553 | | | 554 | | // same as before | 555 | 14.1k | for (int j 
= 0; j < num; j++) { | 556 | 6.51k | *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask); | 557 | 6.51k | output++; | 558 | 6.51k | } | 559 | 7.60k | } | 560 | 12.1k | } |
_ZN5doris10ForDecoderInE19bit_unpack_optimizeIlEEvPKhhiPn Line | Count | Source | 438 | 8.28k | T* output) { | 439 | 8.28k | static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value, | 440 | 8.28k | "bit_unpack_optimize only supports U = int64_t or __int128_t"); | 441 | 8.28k | constexpr int u_size = sizeof(U); // Size of U | 442 | 8.28k | constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size) | 443 | 8.28k | int valid_bit = 0; // How many valid bits | 444 | 8.28k | int need_bit = 0; // still need | 445 | 8.28k | size_t input_size = (in_num * bit_width + 7) >> 3; // input's size | 446 | 8.28k | int full_batch_size = | 447 | 8.28k | cast_set<int>((input_size >> u_size_shift) | 448 | 8.28k | << u_size_shift); // Adjust input_size to a multiple of u_size | 449 | 8.28k | int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size. | 450 | | // The number of bits in input to adjust to multiples of 8 and thus more | 451 | 8.28k | int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width)); | 452 | | | 453 | | // to ensure that only bit_width bits are valid | 454 | 8.28k | T output_mask; | 455 | 8.28k | if (bit_width >= static_cast<int>(sizeof(T) * 8)) { | 456 | 0 | output_mask = static_cast<T>(~T(0)); | 457 | 8.28k | } else { | 458 | 8.28k | output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1); | 459 | 8.28k | } | 460 | | | 461 | 8.28k | U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking | 462 | | | 463 | 274k | for (int i = 0; i < full_batch_size; i += u_size) { | 464 | 266k | s = 0; | 465 | | | 466 | 266k | s = to_endian<std::endian::big>(*((U*)(input + i))); | 467 | | | 468 | | // Determine what the valid bits are based on u_size | 469 | 266k | valid_bit = u_size << 3; | 470 | | | 471 | | // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop. 
| 472 | 266k | if (tail_count == 0 && i == full_batch_size - u_size) { | 473 | 1.12k | valid_bit -= more_bit; | 474 | 1.12k | s >>= more_bit; | 475 | 1.12k | } | 476 | | | 477 | 266k | if (need_bit) { | 478 | | // The last time we take away the high bit_width - need_bit, | 479 | | // we need to make up the rest of the need_bit from the width. | 480 | | // Use valid_bit - need_bit to compute high need_bit bits of s | 481 | | // perform an AND operation to ensure that only need_bit bits are valid | 482 | 207k | auto mask = (static_cast<U>(1) << need_bit) - 1; | 483 | 207k | auto shifted = s >> (valid_bit - need_bit); | 484 | 207k | auto masked_result = shifted & mask; | 485 | | if constexpr (sizeof(T) <= 4) { | 486 | | *output |= static_cast<T>(static_cast<uint32_t>(masked_result)); | 487 | 207k | } else { | 488 | 207k | *output |= static_cast<T>(masked_result); | 489 | 207k | } | 490 | 207k | output++; | 491 | 207k | valid_bit -= need_bit; | 492 | 207k | } | 493 | | | 494 | 266k | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 495 | 266k | int remainder = valid_bit - num * bit_width; // How many bits are left to store | 496 | | | 497 | | // Starting with the highest valid bit, take out bit_width bits in sequence | 498 | | // perform an AND operation with output_mask to ensure that only bit_width bits are valid | 499 | | // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end | 500 | | // But since there are still remainder bits that can't be processed, need to add the remainder | 501 | 1.07M | for (int j = 0; j < num; j++) { | 502 | 808k | *output = | 503 | 808k | static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask); | 504 | 808k | output++; | 505 | 808k | } | 506 | | | 507 | 266k | if (remainder) { | 508 | | // Process the last remaining remainder bit. | 509 | | // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits. 
| 510 | | // output = y << (bit_width - remainder) Use the high bit_width - remainder bit | 511 | | if constexpr (sizeof(T) <= 4) { | 512 | | auto masked_value = static_cast<T>( | 513 | | static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1))); | 514 | | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 515 | 212k | } else { | 516 | 212k | auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1))); | 517 | 212k | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 518 | 212k | } | 519 | | // Already have remainder bits, next time need bit_width - remainder bits | 520 | 212k | need_bit = bit_width - remainder; | 521 | 212k | } else { | 522 | 53.9k | need_bit = 0; | 523 | 53.9k | } | 524 | 266k | } | 525 | | | 526 | | // remainder | 527 | 8.28k | if (tail_count) { | 528 | | // Put the tail_count numbers in the input into s in order, each number occupies 8 bit | 529 | 35.2k | for (int i = 0; i < tail_count; i++) { | 530 | 28.1k | s <<= 8; | 531 | 28.1k | s |= input[full_batch_size + i]; | 532 | 28.1k | } | 533 | | | 534 | | // tail * 8 is the number of bits that are left to process | 535 | | // tail * 8 - more_bit is to remove the last more_bit | 536 | 7.04k | valid_bit = (tail_count << 3) - more_bit; | 537 | 7.04k | s >>= more_bit; | 538 | | | 539 | | // same as before | 540 | 7.04k | if (need_bit) { | 541 | | if constexpr (sizeof(T) <= 4) { | 542 | | *output |= static_cast<T>(static_cast<uint32_t>( | 543 | | (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1))); | 544 | 5.14k | } else { | 545 | 5.14k | *output |= static_cast<T>((s >> (valid_bit - need_bit)) & | 546 | 5.14k | ((static_cast<U>(1) << need_bit) - 1)); | 547 | 5.14k | } | 548 | 5.14k | output++; | 549 | 5.14k | valid_bit -= need_bit; | 550 | 5.14k | } | 551 | | | 552 | 7.04k | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 553 | | | 554 | | // same as before | 555 | 30.7k | for (int j 
= 0; j < num; j++) { | 556 | 23.7k | *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask); | 557 | 23.7k | output++; | 558 | 23.7k | } | 559 | 7.04k | } | 560 | 8.28k | } |
_ZN5doris10ForDecoderInE19bit_unpack_optimizeInEEvPKhhiPn Line | Count | Source | 438 | 48.5k | T* output) { | 439 | 48.5k | static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value, | 440 | 48.5k | "bit_unpack_optimize only supports U = int64_t or __int128_t"); | 441 | 48.5k | constexpr int u_size = sizeof(U); // Size of U | 442 | 48.5k | constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size) | 443 | 48.5k | int valid_bit = 0; // How many valid bits | 444 | 48.5k | int need_bit = 0; // still need | 445 | 48.5k | size_t input_size = (in_num * bit_width + 7) >> 3; // input's size | 446 | 48.5k | int full_batch_size = | 447 | 48.5k | cast_set<int>((input_size >> u_size_shift) | 448 | 48.5k | << u_size_shift); // Adjust input_size to a multiple of u_size | 449 | 48.5k | int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size. | 450 | | // The number of bits in input to adjust to multiples of 8 and thus more | 451 | 48.5k | int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width)); | 452 | | | 453 | | // to ensure that only bit_width bits are valid | 454 | 48.5k | T output_mask; | 455 | 48.5k | if (bit_width >= static_cast<int>(sizeof(T) * 8)) { | 456 | 0 | output_mask = static_cast<T>(~T(0)); | 457 | 48.5k | } else { | 458 | 48.5k | output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1); | 459 | 48.5k | } | 460 | | | 461 | 48.5k | U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking | 462 | | | 463 | 3.52M | for (int i = 0; i < full_batch_size; i += u_size) { | 464 | 3.47M | s = 0; | 465 | | | 466 | 3.47M | s = to_endian<std::endian::big>(*((U*)(input + i))); | 467 | | | 468 | | // Determine what the valid bits are based on u_size | 469 | 3.47M | valid_bit = u_size << 3; | 470 | | | 471 | | // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop. 
| 472 | 3.47M | if (tail_count == 0 && i == full_batch_size - u_size) { | 473 | 10.8k | valid_bit -= more_bit; | 474 | 10.8k | s >>= more_bit; | 475 | 10.8k | } | 476 | | | 477 | 3.47M | if (need_bit) { | 478 | | // The last time we take away the high bit_width - need_bit, | 479 | | // we need to make up the rest of the need_bit from the width. | 480 | | // Use valid_bit - need_bit to compute high need_bit bits of s | 481 | | // perform an AND operation to ensure that only need_bit bits are valid | 482 | 3.30M | auto mask = (static_cast<U>(1) << need_bit) - 1; | 483 | 3.30M | auto shifted = s >> (valid_bit - need_bit); | 484 | 3.30M | auto masked_result = shifted & mask; | 485 | | if constexpr (sizeof(T) <= 4) { | 486 | | *output |= static_cast<T>(static_cast<uint32_t>(masked_result)); | 487 | 3.30M | } else { | 488 | 3.30M | *output |= static_cast<T>(masked_result); | 489 | 3.30M | } | 490 | 3.30M | output++; | 491 | 3.30M | valid_bit -= need_bit; | 492 | 3.30M | } | 493 | | | 494 | 3.47M | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 495 | 3.47M | int remainder = valid_bit - num * bit_width; // How many bits are left to store | 496 | | | 497 | | // Starting with the highest valid bit, take out bit_width bits in sequence | 498 | | // perform an AND operation with output_mask to ensure that only bit_width bits are valid | 499 | | // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end | 500 | | // But since there are still remainder bits that can't be processed, need to add the remainder | 501 | 5.30M | for (int j = 0; j < num; j++) { | 502 | 1.83M | *output = | 503 | 1.83M | static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask); | 504 | 1.83M | output++; | 505 | 1.83M | } | 506 | | | 507 | 3.47M | if (remainder) { | 508 | | // Process the last remaining remainder bit. | 509 | | // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits. 
| 510 | | // output = y << (bit_width - remainder) Use the high bit_width - remainder bit | 511 | | if constexpr (sizeof(T) <= 4) { | 512 | | auto masked_value = static_cast<T>( | 513 | | static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1))); | 514 | | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 515 | 3.34M | } else { | 516 | 3.34M | auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1))); | 517 | 3.34M | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 518 | 3.34M | } | 519 | | // Already have remainder bits, next time need bit_width - remainder bits | 520 | 3.34M | need_bit = bit_width - remainder; | 521 | 3.34M | } else { | 522 | 132k | need_bit = 0; | 523 | 132k | } | 524 | 3.47M | } | 525 | | | 526 | | // remainder | 527 | 48.5k | if (tail_count) { | 528 | | // Put the tail_count numbers in the input into s in order, each number occupies 8 bit | 529 | 338k | for (int i = 0; i < tail_count; i++) { | 530 | 300k | s <<= 8; | 531 | 300k | s |= input[full_batch_size + i]; | 532 | 300k | } | 533 | | | 534 | | // tail * 8 is the number of bits that are left to process | 535 | | // tail * 8 - more_bit is to remove the last more_bit | 536 | 37.6k | valid_bit = (tail_count << 3) - more_bit; | 537 | 37.6k | s >>= more_bit; | 538 | | | 539 | | // same as before | 540 | 37.6k | if (need_bit) { | 541 | | if constexpr (sizeof(T) <= 4) { | 542 | | *output |= static_cast<T>(static_cast<uint32_t>( | 543 | | (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1))); | 544 | 36.4k | } else { | 545 | 36.4k | *output |= static_cast<T>((s >> (valid_bit - need_bit)) & | 546 | 36.4k | ((static_cast<U>(1) << need_bit) - 1)); | 547 | 36.4k | } | 548 | 36.4k | output++; | 549 | 36.4k | valid_bit -= need_bit; | 550 | 36.4k | } | 551 | | | 552 | 37.6k | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 553 | | | 554 | | // same as before | 555 | 50.9k | for (int 
j = 0; j < num; j++) { | 556 | 13.3k | *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask); | 557 | 13.3k | output++; | 558 | 13.3k | } | 559 | 37.6k | } | 560 | 48.5k | } |
Unexecuted instantiation: _ZN5doris10ForDecoderIhE19bit_unpack_optimizeIlEEvPKhhiPh Unexecuted instantiation: _ZN5doris10ForDecoderIhE19bit_unpack_optimizeInEEvPKhhiPh Unexecuted instantiation: _ZN5doris10ForDecoderItE19bit_unpack_optimizeIlEEvPKhhiPt Unexecuted instantiation: _ZN5doris10ForDecoderItE19bit_unpack_optimizeInEEvPKhhiPt _ZN5doris10ForDecoderIjE19bit_unpack_optimizeIlEEvPKhhiPj Line | Count | Source | 438 | 5 | T* output) { | 439 | 5 | static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value, | 440 | 5 | "bit_unpack_optimize only supports U = int64_t or __int128_t"); | 441 | 5 | constexpr int u_size = sizeof(U); // Size of U | 442 | 5 | constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size) | 443 | 5 | int valid_bit = 0; // How many valid bits | 444 | 5 | int need_bit = 0; // still need | 445 | 5 | size_t input_size = (in_num * bit_width + 7) >> 3; // input's size | 446 | 5 | int full_batch_size = | 447 | 5 | cast_set<int>((input_size >> u_size_shift) | 448 | 5 | << u_size_shift); // Adjust input_size to a multiple of u_size | 449 | 5 | int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size. 
| 450 | | // The number of bits in input to adjust to multiples of 8 and thus more | 451 | 5 | int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width)); | 452 | | | 453 | | // to ensure that only bit_width bits are valid | 454 | 5 | T output_mask; | 455 | 5 | if (bit_width >= static_cast<int>(sizeof(T) * 8)) { | 456 | 0 | output_mask = static_cast<T>(~T(0)); | 457 | 5 | } else { | 458 | 5 | output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1); | 459 | 5 | } | 460 | | | 461 | 5 | U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking | 462 | | | 463 | 15 | for (int i = 0; i < full_batch_size; i += u_size) { | 464 | 10 | s = 0; | 465 | | | 466 | 10 | s = to_endian<std::endian::big>(*((U*)(input + i))); | 467 | | | 468 | | // Determine what the valid bits are based on u_size | 469 | 10 | valid_bit = u_size << 3; | 470 | | | 471 | | // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop. | 472 | 10 | if (tail_count == 0 && i == full_batch_size - u_size) { | 473 | 5 | valid_bit -= more_bit; | 474 | 5 | s >>= more_bit; | 475 | 5 | } | 476 | | | 477 | 10 | if (need_bit) { | 478 | | // The last time we take away the high bit_width - need_bit, | 479 | | // we need to make up the rest of the need_bit from the width. 
| 480 | | // Use valid_bit - need_bit to compute high need_bit bits of s | 481 | | // perform an AND operation to ensure that only need_bit bits are valid | 482 | 0 | auto mask = (static_cast<U>(1) << need_bit) - 1; | 483 | 0 | auto shifted = s >> (valid_bit - need_bit); | 484 | 0 | auto masked_result = shifted & mask; | 485 | 0 | if constexpr (sizeof(T) <= 4) { | 486 | 0 | *output |= static_cast<T>(static_cast<uint32_t>(masked_result)); | 487 | | } else { | 488 | | *output |= static_cast<T>(masked_result); | 489 | | } | 490 | 0 | output++; | 491 | 0 | valid_bit -= need_bit; | 492 | 0 | } | 493 | | | 494 | 10 | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 495 | 10 | int remainder = valid_bit - num * bit_width; // How many bits are left to store | 496 | | | 497 | | // Starting with the highest valid bit, take out bit_width bits in sequence | 498 | | // perform an AND operation with output_mask to ensure that only bit_width bits are valid | 499 | | // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end | 500 | | // But since there are still remainder bits that can't be processed, need to add the remainder | 501 | 650 | for (int j = 0; j < num; j++) { | 502 | 640 | *output = | 503 | 640 | static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask); | 504 | 640 | output++; | 505 | 640 | } | 506 | | | 507 | 10 | if (remainder) { | 508 | | // Process the last remaining remainder bit. | 509 | | // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits. 
| 510 | | // output = y << (bit_width - remainder) Use the high bit_width - remainder bit | 511 | 0 | if constexpr (sizeof(T) <= 4) { | 512 | 0 | auto masked_value = static_cast<T>( | 513 | 0 | static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1))); | 514 | 0 | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 515 | | } else { | 516 | | auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1))); | 517 | | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 518 | | } | 519 | | // Already have remainder bits, next time need bit_width - remainder bits | 520 | 0 | need_bit = bit_width - remainder; | 521 | 10 | } else { | 522 | 10 | need_bit = 0; | 523 | 10 | } | 524 | 10 | } | 525 | | | 526 | | // remainder | 527 | 5 | if (tail_count) { | 528 | | // Put the tail_count numbers in the input into s in order, each number occupies 8 bit | 529 | 0 | for (int i = 0; i < tail_count; i++) { | 530 | 0 | s <<= 8; | 531 | 0 | s |= input[full_batch_size + i]; | 532 | 0 | } | 533 | | | 534 | | // tail * 8 is the number of bits that are left to process | 535 | | // tail * 8 - more_bit is to remove the last more_bit | 536 | 0 | valid_bit = (tail_count << 3) - more_bit; | 537 | 0 | s >>= more_bit; | 538 | | | 539 | | // same as before | 540 | 0 | if (need_bit) { | 541 | 0 | if constexpr (sizeof(T) <= 4) { | 542 | 0 | *output |= static_cast<T>(static_cast<uint32_t>( | 543 | 0 | (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1))); | 544 | | } else { | 545 | | *output |= static_cast<T>((s >> (valid_bit - need_bit)) & | 546 | | ((static_cast<U>(1) << need_bit) - 1)); | 547 | | } | 548 | 0 | output++; | 549 | 0 | valid_bit -= need_bit; | 550 | 0 | } | 551 | |
| 552 | 0 | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 553 | | | 554 | | // same as before | 555 | 0 | for (int j = 0; j < num; j++) { | 556 | 0 | *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask); | 557 | 0 | output++; | 558 | 0 | } | 559 | 0 | } | 560 | 5 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderIjE19bit_unpack_optimizeInEEvPKhhiPj Unexecuted instantiation: _ZN5doris10ForDecoderImE19bit_unpack_optimizeIlEEvPKhhiPm Unexecuted instantiation: _ZN5doris10ForDecoderImE19bit_unpack_optimizeInEEvPKhhiPm Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE19bit_unpack_optimizeIlEEvPKhhiPS1_ Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE19bit_unpack_optimizeInEEvPKhhiPS1_ Unexecuted instantiation: _ZN5doris10ForDecoderIoE19bit_unpack_optimizeIlEEvPKhhiPo Unexecuted instantiation: _ZN5doris10ForDecoderIoE19bit_unpack_optimizeInEEvPKhhiPo |
561 | | |
562 | | // The reverse of bit_pack method, get original integer data list from packed bits |
563 | | // param[in] input: the packed bits need to unpack |
564 | | // param[in] in_num: the integer number in packed bits |
565 | | // param[in] bit_width: how many bit we used to store each integer data |
566 | | // param[out] output: the original integer data list |
567 | | template <typename T> |
568 | 81.3k | void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) { |
569 | | /* |
570 | | When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times. |
571 | | When bit_width > 64, we must use __int128_t and unroll the loop 16 times. |
572 | | */ |
573 | 81.3k | if (bit_width <= 32) { |
574 | 20.6k | bit_unpack_optimize<int64_t>(input, in_num, bit_width, output); |
575 | 60.6k | } else { |
576 | 60.6k | bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output); |
577 | 60.6k | } |
578 | 81.3k | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE10bit_unpackEPKhhiPa Unexecuted instantiation: _ZN5doris10ForDecoderIsE10bit_unpackEPKhhiPs _ZN5doris10ForDecoderIiE10bit_unpackEPKhhiPi Line | Count | Source | 568 | 9 | void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) { | 569 | | /* | 570 | | When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times. | 571 | | When bit_width > 64, we must use __int128_t and unroll the loop 16 times. | 572 | | */ | 573 | 9 | if (bit_width <= 32) { | 574 | 9 | bit_unpack_optimize<int64_t>(input, in_num, bit_width, output); | 575 | 9 | } else { | 576 | 0 | bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output); | 577 | 0 | } | 578 | 9 | } |
_ZN5doris10ForDecoderIlE10bit_unpackEPKhhiPl Line | Count | Source | 568 | 24.4k | void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) { | 569 | | /* | 570 | | When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times. | 571 | | When bit_width > 64, we must use __int128_t and unroll the loop 16 times. | 572 | | */ | 573 | 24.4k | if (bit_width <= 32) { | 574 | 12.3k | bit_unpack_optimize<int64_t>(input, in_num, bit_width, output); | 575 | 12.3k | } else { | 576 | 12.1k | bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output); | 577 | 12.1k | } | 578 | 24.4k | } |
_ZN5doris10ForDecoderInE10bit_unpackEPKhhiPn Line | Count | Source | 568 | 56.8k | void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) { | 569 | | /* | 570 | | When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times. | 571 | | When bit_width > 64, we must use __int128_t and unroll the loop 16 times. | 572 | | */ | 573 | 56.8k | if (bit_width <= 32) { | 574 | 8.28k | bit_unpack_optimize<int64_t>(input, in_num, bit_width, output); | 575 | 48.5k | } else { | 576 | 48.5k | bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output); | 577 | 48.5k | } | 578 | 56.8k | } |
Unexecuted instantiation: _ZN5doris10ForDecoderIhE10bit_unpackEPKhhiPh Unexecuted instantiation: _ZN5doris10ForDecoderItE10bit_unpackEPKhhiPt _ZN5doris10ForDecoderIjE10bit_unpackEPKhhiPj Line | Count | Source | 568 | 5 | void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) { | 569 | | /* | 570 | | When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times. | 571 | | When bit_width > 64, we must use __int128_t and unroll the loop 16 times. | 572 | | */ | 573 | 5 | if (bit_width <= 32) { | 574 | 5 | bit_unpack_optimize<int64_t>(input, in_num, bit_width, output); | 575 | 5 | } else { | 576 | 0 | bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output); | 577 | 0 | } | 578 | 5 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderImE10bit_unpackEPKhhiPm Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE10bit_unpackEPKhhiPS1_ Unexecuted instantiation: _ZN5doris10ForDecoderIoE10bit_unpackEPKhhiPo |
579 | | |
580 | | template <typename T> |
581 | 4.17M | void ForDecoder<T>::decode_current_frame(T* output) { |
582 | 4.17M | uint32_t frame_index = _current_index / _max_frame_size; |
583 | 4.17M | if (frame_index == _current_decoded_frame) { |
584 | 4.12M | return; // current frame already decoded |
585 | 4.12M | } |
586 | 48.9k | _current_decoded_frame = frame_index; |
587 | 48.9k | uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index)); |
588 | | |
589 | 48.9k | uint32_t base_offset = _frame_offsets[_current_decoded_frame]; |
590 | 48.9k | T min = 0; |
591 | 48.9k | uint32_t delta_offset = 0; |
592 | 48.9k | if constexpr (sizeof(T) == 16) { |
593 | 24.4k | min = static_cast<T>(decode_fixed128_le(_buffer + base_offset)); |
594 | 24.4k | delta_offset = base_offset + 16; |
595 | 24.4k | } else if constexpr (sizeof(T) == 8) { |
596 | 24.4k | min = static_cast<T>(decode_fixed64_le(_buffer + base_offset)); |
597 | 24.4k | delta_offset = base_offset + 8; |
598 | 24.4k | } else { |
599 | 14 | min = static_cast<T>(decode_fixed32_le(_buffer + base_offset)); |
600 | 14 | delta_offset = base_offset + 4; |
601 | 14 | } |
602 | | |
603 | 48.9k | uint8_t bit_width = _bit_widths[_current_decoded_frame]; |
604 | | |
605 | 48.9k | bool is_original_value = _storage_formats[_current_decoded_frame] == 2; |
606 | 48.9k | if (is_original_value) { |
607 | 0 | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output); |
608 | 48.9k | } else { |
609 | 48.9k | bool is_ascending = _storage_formats[_current_decoded_frame] == 1; |
610 | 48.9k | std::vector<T> delta_values(current_frame_size); |
611 | 48.9k | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data()); |
612 | 48.9k | if (is_ascending) { |
613 | 451 | T pre_value = min; |
614 | 3.87k | for (uint8_t i = 0; i < current_frame_size; i++) { |
615 | 3.42k | T value = delta_values[i] + pre_value; |
616 | 3.42k | output[i] = value; |
617 | 3.42k | pre_value = value; |
618 | 3.42k | } |
619 | 48.4k | } else { |
620 | 4.22M | for (uint8_t i = 0; i < current_frame_size; i++) { |
621 | 4.17M | output[i] = delta_values[i] + min; |
622 | 4.17M | } |
623 | 48.4k | } |
624 | 48.9k | } |
625 | 48.9k | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE20decode_current_frameEPa Unexecuted instantiation: _ZN5doris10ForDecoderIsE20decode_current_frameEPs _ZN5doris10ForDecoderIiE20decode_current_frameEPi Line | Count | Source | 581 | 10 | void ForDecoder<T>::decode_current_frame(T* output) { | 582 | 10 | uint32_t frame_index = _current_index / _max_frame_size; | 583 | 10 | if (frame_index == _current_decoded_frame) { | 584 | 1 | return; // current frame already decoded | 585 | 1 | } | 586 | 9 | _current_decoded_frame = frame_index; | 587 | 9 | uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index)); | 588 | | | 589 | 9 | uint32_t base_offset = _frame_offsets[_current_decoded_frame]; | 590 | 9 | T min = 0; | 591 | 9 | uint32_t delta_offset = 0; | 592 | | if constexpr (sizeof(T) == 16) { | 593 | | min = static_cast<T>(decode_fixed128_le(_buffer + base_offset)); | 594 | | delta_offset = base_offset + 16; | 595 | | } else if constexpr (sizeof(T) == 8) { | 596 | | min = static_cast<T>(decode_fixed64_le(_buffer + base_offset)); | 597 | | delta_offset = base_offset + 8; | 598 | 9 | } else { | 599 | 9 | min = static_cast<T>(decode_fixed32_le(_buffer + base_offset)); | 600 | 9 | delta_offset = base_offset + 4; | 601 | 9 | } | 602 | | | 603 | 9 | uint8_t bit_width = _bit_widths[_current_decoded_frame]; | 604 | | | 605 | 9 | bool is_original_value = _storage_formats[_current_decoded_frame] == 2; | 606 | 9 | if (is_original_value) { | 607 | 0 | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output); | 608 | 9 | } else { | 609 | 9 | bool is_ascending = _storage_formats[_current_decoded_frame] == 1; | 610 | 9 | std::vector<T> delta_values(current_frame_size); | 611 | 9 | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data()); | 612 | 9 | if (is_ascending) { | 613 | 9 | T pre_value = min; | 614 | 780 | for (uint8_t i = 0; i < current_frame_size; i++) { | 615 | 771 | T value = delta_values[i] + 
pre_value; | 616 | 771 | output[i] = value; | 617 | 771 | pre_value = value; | 618 | 771 | } | 619 | 9 | } else { | 620 | 0 | for (uint8_t i = 0; i < current_frame_size; i++) { | 621 | 0 | output[i] = delta_values[i] + min; | 622 | 0 | } | 623 | 0 | } | 624 | 9 | } | 625 | 9 | } |
_ZN5doris10ForDecoderIlE20decode_current_frameEPl Line | Count | Source | 581 | 2.08M | void ForDecoder<T>::decode_current_frame(T* output) { | 582 | 2.08M | uint32_t frame_index = _current_index / _max_frame_size; | 583 | 2.08M | if (frame_index == _current_decoded_frame) { | 584 | 2.06M | return; // current frame already decoded | 585 | 2.06M | } | 586 | 24.4k | _current_decoded_frame = frame_index; | 587 | 24.4k | uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index)); | 588 | | | 589 | 24.4k | uint32_t base_offset = _frame_offsets[_current_decoded_frame]; | 590 | 24.4k | T min = 0; | 591 | 24.4k | uint32_t delta_offset = 0; | 592 | | if constexpr (sizeof(T) == 16) { | 593 | | min = static_cast<T>(decode_fixed128_le(_buffer + base_offset)); | 594 | | delta_offset = base_offset + 16; | 595 | 24.4k | } else if constexpr (sizeof(T) == 8) { | 596 | 24.4k | min = static_cast<T>(decode_fixed64_le(_buffer + base_offset)); | 597 | 24.4k | delta_offset = base_offset + 8; | 598 | | } else { | 599 | | min = static_cast<T>(decode_fixed32_le(_buffer + base_offset)); | 600 | | delta_offset = base_offset + 4; | 601 | | } | 602 | | | 603 | 24.4k | uint8_t bit_width = _bit_widths[_current_decoded_frame]; | 604 | | | 605 | 24.4k | bool is_original_value = _storage_formats[_current_decoded_frame] == 2; | 606 | 24.4k | if (is_original_value) { | 607 | 0 | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output); | 608 | 24.4k | } else { | 609 | 24.4k | bool is_ascending = _storage_formats[_current_decoded_frame] == 1; | 610 | 24.4k | std::vector<T> delta_values(current_frame_size); | 611 | 24.4k | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data()); | 612 | 24.4k | if (is_ascending) { | 613 | 223 | T pre_value = min; | 614 | 1.89k | for (uint8_t i = 0; i < current_frame_size; i++) { | 615 | 1.67k | T value = delta_values[i] + pre_value; | 616 | 1.67k | output[i] = value; | 617 | 1.67k | pre_value = value; | 618 | 
1.67k | } | 619 | 24.2k | } else { | 620 | 2.11M | for (uint8_t i = 0; i < current_frame_size; i++) { | 621 | 2.08M | output[i] = delta_values[i] + min; | 622 | 2.08M | } | 623 | 24.2k | } | 624 | 24.4k | } | 625 | 24.4k | } |
_ZN5doris10ForDecoderInE20decode_current_frameEPn Line | Count | Source | 581 | 2.08M | void ForDecoder<T>::decode_current_frame(T* output) { | 582 | 2.08M | uint32_t frame_index = _current_index / _max_frame_size; | 583 | 2.08M | if (frame_index == _current_decoded_frame) { | 584 | 2.06M | return; // current frame already decoded | 585 | 2.06M | } | 586 | 24.4k | _current_decoded_frame = frame_index; | 587 | 24.4k | uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index)); | 588 | | | 589 | 24.4k | uint32_t base_offset = _frame_offsets[_current_decoded_frame]; | 590 | 24.4k | T min = 0; | 591 | 24.4k | uint32_t delta_offset = 0; | 592 | 24.4k | if constexpr (sizeof(T) == 16) { | 593 | 24.4k | min = static_cast<T>(decode_fixed128_le(_buffer + base_offset)); | 594 | 24.4k | delta_offset = base_offset + 16; | 595 | | } else if constexpr (sizeof(T) == 8) { | 596 | | min = static_cast<T>(decode_fixed64_le(_buffer + base_offset)); | 597 | | delta_offset = base_offset + 8; | 598 | | } else { | 599 | | min = static_cast<T>(decode_fixed32_le(_buffer + base_offset)); | 600 | | delta_offset = base_offset + 4; | 601 | | } | 602 | | | 603 | 24.4k | uint8_t bit_width = _bit_widths[_current_decoded_frame]; | 604 | | | 605 | 24.4k | bool is_original_value = _storage_formats[_current_decoded_frame] == 2; | 606 | 24.4k | if (is_original_value) { | 607 | 0 | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output); | 608 | 24.4k | } else { | 609 | 24.4k | bool is_ascending = _storage_formats[_current_decoded_frame] == 1; | 610 | 24.4k | std::vector<T> delta_values(current_frame_size); | 611 | 24.4k | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data()); | 612 | 24.4k | if (is_ascending) { | 613 | 214 | T pre_value = min; | 614 | 552 | for (uint8_t i = 0; i < current_frame_size; i++) { | 615 | 338 | T value = delta_values[i] + pre_value; | 616 | 338 | output[i] = value; | 617 | 338 | pre_value = value; | 618 | 338 | } 
| 619 | 24.2k | } else { | 620 | 2.11M | for (uint8_t i = 0; i < current_frame_size; i++) { | 621 | 2.08M | output[i] = delta_values[i] + min; | 622 | 2.08M | } | 623 | 24.2k | } | 624 | 24.4k | } | 625 | 24.4k | } |
Unexecuted instantiation: _ZN5doris10ForDecoderIhE20decode_current_frameEPh Unexecuted instantiation: _ZN5doris10ForDecoderItE20decode_current_frameEPt _ZN5doris10ForDecoderIjE20decode_current_frameEPj Line | Count | Source | 581 | 5 | void ForDecoder<T>::decode_current_frame(T* output) { | 582 | 5 | uint32_t frame_index = _current_index / _max_frame_size; | 583 | 5 | if (frame_index == _current_decoded_frame) { | 584 | 0 | return; // current frame already decoded | 585 | 0 | } | 586 | 5 | _current_decoded_frame = frame_index; | 587 | 5 | uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index)); | 588 | | | 589 | 5 | uint32_t base_offset = _frame_offsets[_current_decoded_frame]; | 590 | 5 | T min = 0; | 591 | 5 | uint32_t delta_offset = 0; | 592 | | if constexpr (sizeof(T) == 16) { | 593 | | min = static_cast<T>(decode_fixed128_le(_buffer + base_offset)); | 594 | | delta_offset = base_offset + 16; | 595 | | } else if constexpr (sizeof(T) == 8) { | 596 | | min = static_cast<T>(decode_fixed64_le(_buffer + base_offset)); | 597 | | delta_offset = base_offset + 8; | 598 | 5 | } else { | 599 | 5 | min = static_cast<T>(decode_fixed32_le(_buffer + base_offset)); | 600 | 5 | delta_offset = base_offset + 4; | 601 | 5 | } | 602 | | | 603 | 5 | uint8_t bit_width = _bit_widths[_current_decoded_frame]; | 604 | | | 605 | 5 | bool is_original_value = _storage_formats[_current_decoded_frame] == 2; | 606 | 5 | if (is_original_value) { | 607 | 0 | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output); | 608 | 5 | } else { | 609 | 5 | bool is_ascending = _storage_formats[_current_decoded_frame] == 1; | 610 | 5 | std::vector<T> delta_values(current_frame_size); | 611 | 5 | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data()); | 612 | 5 | if (is_ascending) { | 613 | 5 | T pre_value = min; | 614 | 645 | for (uint8_t i = 0; i < current_frame_size; i++) { | 615 | 640 | T value = delta_values[i] + pre_value; | 616 | 640 | 
output[i] = value; | 617 | 640 | pre_value = value; | 618 | 640 | } | 619 | 5 | } else { | 620 | 0 | for (uint8_t i = 0; i < current_frame_size; i++) { | 621 | 0 | output[i] = delta_values[i] + min; | 622 | 0 | } | 623 | 0 | } | 624 | 5 | } | 625 | 5 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderImE20decode_current_frameEPm Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE20decode_current_frameEPS1_ Unexecuted instantiation: _ZN5doris10ForDecoderIoE20decode_current_frameEPo |
626 | | |
627 | | template <typename T> |
628 | 12 | T ForDecoder<T>::decode_frame_min_value(uint32_t frame_index) { |
629 | 12 | uint32_t min_offset = _frame_offsets[frame_index]; |
630 | 12 | T min = 0; |
631 | 12 | if constexpr (sizeof(T) == 16) { |
632 | 0 | min = static_cast<T>(decode_fixed128_le(_buffer + min_offset)); |
633 | 12 | } else if constexpr (sizeof(T) == 8) { |
634 | 12 | min = static_cast<T>(decode_fixed64_le(_buffer + min_offset)); |
635 | 12 | } else { |
636 | 0 | min = static_cast<T>(decode_fixed32_le(_buffer + min_offset)); |
637 | 0 | } |
638 | 12 | return min; |
639 | 12 | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE22decode_frame_min_valueEj Unexecuted instantiation: _ZN5doris10ForDecoderIsE22decode_frame_min_valueEj Unexecuted instantiation: _ZN5doris10ForDecoderIiE22decode_frame_min_valueEj _ZN5doris10ForDecoderIlE22decode_frame_min_valueEj Line | Count | Source | 628 | 12 | T ForDecoder<T>::decode_frame_min_value(uint32_t frame_index) { | 629 | 12 | uint32_t min_offset = _frame_offsets[frame_index]; | 630 | 12 | T min = 0; | 631 | | if constexpr (sizeof(T) == 16) { | 632 | | min = static_cast<T>(decode_fixed128_le(_buffer + min_offset)); | 633 | 12 | } else if constexpr (sizeof(T) == 8) { | 634 | 12 | min = static_cast<T>(decode_fixed64_le(_buffer + min_offset)); | 635 | | } else { | 636 | | min = static_cast<T>(decode_fixed32_le(_buffer + min_offset)); | 637 | | } | 638 | 12 | return min; | 639 | 12 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderInE22decode_frame_min_valueEj Unexecuted instantiation: _ZN5doris10ForDecoderIhE22decode_frame_min_valueEj Unexecuted instantiation: _ZN5doris10ForDecoderItE22decode_frame_min_valueEj Unexecuted instantiation: _ZN5doris10ForDecoderIjE22decode_frame_min_valueEj Unexecuted instantiation: _ZN5doris10ForDecoderImE22decode_frame_min_valueEj Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE22decode_frame_min_valueEj Unexecuted instantiation: _ZN5doris10ForDecoderIoE22decode_frame_min_valueEj |
640 | | |
641 | | template <typename T> |
642 | 4.17M | T* ForDecoder<T>::copy_value(T* val, size_t count) { |
643 | 4.17M | memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count); |
644 | 4.17M | _current_index += count; |
645 | 4.17M | val += count; |
646 | 4.17M | return val; |
647 | 4.17M | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE10copy_valueEPam Unexecuted instantiation: _ZN5doris10ForDecoderIsE10copy_valueEPsm _ZN5doris10ForDecoderIiE10copy_valueEPim Line | Count | Source | 642 | 8 | T* ForDecoder<T>::copy_value(T* val, size_t count) { | 643 | 8 | memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count); | 644 | 8 | _current_index += count; | 645 | 8 | val += count; | 646 | 8 | return val; | 647 | 8 | } |
_ZN5doris10ForDecoderIlE10copy_valueEPlm Line | Count | Source | 642 | 2.08M | T* ForDecoder<T>::copy_value(T* val, size_t count) { | 643 | 2.08M | memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count); | 644 | 2.08M | _current_index += count; | 645 | 2.08M | val += count; | 646 | 2.08M | return val; | 647 | 2.08M | } |
_ZN5doris10ForDecoderInE10copy_valueEPnm Line | Count | Source | 642 | 2.08M | T* ForDecoder<T>::copy_value(T* val, size_t count) { | 643 | 2.08M | memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count); | 644 | 2.08M | _current_index += count; | 645 | 2.08M | val += count; | 646 | 2.08M | return val; | 647 | 2.08M | } |
Unexecuted instantiation: _ZN5doris10ForDecoderIhE10copy_valueEPhm Unexecuted instantiation: _ZN5doris10ForDecoderItE10copy_valueEPtm _ZN5doris10ForDecoderIjE10copy_valueEPjm Line | Count | Source | 642 | 3 | T* ForDecoder<T>::copy_value(T* val, size_t count) { | 643 | 3 | memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count); | 644 | 3 | _current_index += count; | 645 | 3 | val += count; | 646 | 3 | return val; | 647 | 3 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderImE10copy_valueEPmm Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE10copy_valueEPS1_m Unexecuted instantiation: _ZN5doris10ForDecoderIoE10copy_valueEPom |
648 | | |
649 | | template <typename T> |
650 | 4.17M | bool ForDecoder<T>::get_batch(T* val, size_t count) { |
651 | 4.17M | if (_current_index + count > _values_num) { |
652 | 1 | return false; |
653 | 1 | } |
654 | | |
655 | 4.17M | decode_current_frame(_out_buffer.data()); |
656 | | |
657 | 4.17M | if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) { |
658 | 4.16M | copy_value(val, count); |
659 | 4.16M | return true; |
660 | 4.16M | } |
661 | | |
662 | | // 1. padding one frame |
663 | 16.3k | size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index; |
664 | 16.3k | val = copy_value(val, padding_num); |
665 | | |
666 | | // 2. process frame by frame |
667 | 16.3k | size_t frame_count = (count - padding_num) / _max_frame_size; |
668 | 16.4k | for (size_t i = 0; i < frame_count; i++) { |
669 | | // directly decode value to the output, don't buffer the value |
670 | 7 | decode_current_frame(val); |
671 | 7 | _current_index += _max_frame_size; |
672 | 7 | val += _max_frame_size; |
673 | 7 | } |
674 | | |
675 | | // 3. process remaining value |
676 | 16.3k | size_t remaining_num = (count - padding_num) % _max_frame_size; |
677 | 16.3k | if (remaining_num > 0) { |
678 | 4 | decode_current_frame(_out_buffer.data()); |
679 | 4 | val = copy_value(val, remaining_num); |
680 | 4 | } |
681 | | |
682 | 16.3k | return true; |
683 | 4.17M | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE9get_batchEPam Unexecuted instantiation: _ZN5doris10ForDecoderIsE9get_batchEPsm _ZN5doris10ForDecoderIiE9get_batchEPim Line | Count | Source | 650 | 8 | bool ForDecoder<T>::get_batch(T* val, size_t count) { | 651 | 8 | if (_current_index + count > _values_num) { | 652 | 1 | return false; | 653 | 1 | } | 654 | | | 655 | 7 | decode_current_frame(_out_buffer.data()); | 656 | | | 657 | 7 | if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) { | 658 | 4 | copy_value(val, count); | 659 | 4 | return true; | 660 | 4 | } | 661 | | | 662 | | // 1. padding one frame | 663 | 3 | size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index; | 664 | 3 | val = copy_value(val, padding_num); | 665 | | | 666 | | // 2. process frame by frame | 667 | 3 | size_t frame_count = (count - padding_num) / _max_frame_size; | 668 | 5 | for (size_t i = 0; i < frame_count; i++) { | 669 | | // directly decode value to the output, don't buffer the value | 670 | 2 | decode_current_frame(val); | 671 | 2 | _current_index += _max_frame_size; | 672 | 2 | val += _max_frame_size; | 673 | 2 | } | 674 | | | 675 | | // 3. process remaining value | 676 | 3 | size_t remaining_num = (count - padding_num) % _max_frame_size; | 677 | 3 | if (remaining_num > 0) { | 678 | 1 | decode_current_frame(_out_buffer.data()); | 679 | 1 | val = copy_value(val, remaining_num); | 680 | 1 | } | 681 | | | 682 | 3 | return true; | 683 | 7 | } |
_ZN5doris10ForDecoderIlE9get_batchEPlm Line | Count | Source | 650 | 2.08M | bool ForDecoder<T>::get_batch(T* val, size_t count) { | 651 | 2.08M | if (_current_index + count > _values_num) { | 652 | 0 | return false; | 653 | 0 | } | 654 | | | 655 | 2.08M | decode_current_frame(_out_buffer.data()); | 656 | | | 657 | 2.08M | if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) { | 658 | 2.08M | copy_value(val, count); | 659 | 2.08M | return true; | 660 | 2.08M | } | 661 | | | 662 | | // 1. padding one frame | 663 | 8.19k | size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index; | 664 | 8.19k | val = copy_value(val, padding_num); | 665 | | | 666 | | // 2. process frame by frame | 667 | 8.19k | size_t frame_count = (count - padding_num) / _max_frame_size; | 668 | 8.19k | for (size_t i = 0; i < frame_count; i++) { | 669 | | // directly decode value to the output, don't buffer the value | 670 | 3 | decode_current_frame(val); | 671 | 3 | _current_index += _max_frame_size; | 672 | 3 | val += _max_frame_size; | 673 | 3 | } | 674 | | | 675 | | // 3. process remaining value | 676 | 8.19k | size_t remaining_num = (count - padding_num) % _max_frame_size; | 677 | 8.19k | if (remaining_num > 0) { | 678 | 3 | decode_current_frame(_out_buffer.data()); | 679 | 3 | val = copy_value(val, remaining_num); | 680 | 3 | } | 681 | | | 682 | 8.19k | return true; | 683 | 2.08M | } |
_ZN5doris10ForDecoderInE9get_batchEPnm Line | Count | Source | 650 | 2.08M | bool ForDecoder<T>::get_batch(T* val, size_t count) { | 651 | 2.08M | if (_current_index + count > _values_num) { | 652 | 0 | return false; | 653 | 0 | } | 654 | | | 655 | 2.08M | decode_current_frame(_out_buffer.data()); | 656 | | | 657 | 2.08M | if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) { | 658 | 2.08M | copy_value(val, count); | 659 | 2.08M | return true; | 660 | 2.08M | } | 661 | | | 662 | | // 1. padding one frame | 663 | 8.19k | size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index; | 664 | 8.19k | val = copy_value(val, padding_num); | 665 | | | 666 | | // 2. process frame by frame | 667 | 8.19k | size_t frame_count = (count - padding_num) / _max_frame_size; | 668 | 8.19k | for (size_t i = 0; i < frame_count; i++) { | 669 | | // directly decode value to the output, don't buffer the value | 670 | 0 | decode_current_frame(val); | 671 | 0 | _current_index += _max_frame_size; | 672 | 0 | val += _max_frame_size; | 673 | 0 | } | 674 | | | 675 | | // 3. process remaining value | 676 | 8.19k | size_t remaining_num = (count - padding_num) % _max_frame_size; | 677 | 8.19k | if (remaining_num > 0) { | 678 | 0 | decode_current_frame(_out_buffer.data()); | 679 | 0 | val = copy_value(val, remaining_num); | 680 | 0 | } | 681 | | | 682 | 8.19k | return true; | 683 | 2.08M | } |
Unexecuted instantiation: _ZN5doris10ForDecoderIhE9get_batchEPhm Unexecuted instantiation: _ZN5doris10ForDecoderItE9get_batchEPtm _ZN5doris10ForDecoderIjE9get_batchEPjm Line | Count | Source | 650 | 3 | bool ForDecoder<T>::get_batch(T* val, size_t count) { | 651 | 3 | if (_current_index + count > _values_num) { | 652 | 0 | return false; | 653 | 0 | } | 654 | | | 655 | 3 | decode_current_frame(_out_buffer.data()); | 656 | | | 657 | 3 | if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) { | 658 | 0 | copy_value(val, count); | 659 | 0 | return true; | 660 | 0 | } | 661 | | | 662 | | // 1. padding one frame | 663 | 3 | size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index; | 664 | 3 | val = copy_value(val, padding_num); | 665 | | | 666 | | // 2. process frame by frame | 667 | 3 | size_t frame_count = (count - padding_num) / _max_frame_size; | 668 | 5 | for (size_t i = 0; i < frame_count; i++) { | 669 | | // directly decode value to the output, don't buffer the value | 670 | 2 | decode_current_frame(val); | 671 | 2 | _current_index += _max_frame_size; | 672 | 2 | val += _max_frame_size; | 673 | 2 | } | 674 | | | 675 | | // 3. process remaining value | 676 | 3 | size_t remaining_num = (count - padding_num) % _max_frame_size; | 677 | 3 | if (remaining_num > 0) { | 678 | 0 | decode_current_frame(_out_buffer.data()); | 679 | 0 | val = copy_value(val, remaining_num); | 680 | 0 | } | 681 | | | 682 | 3 | return true; | 683 | 3 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderImE9get_batchEPmm Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE9get_batchEPS1_m Unexecuted instantiation: _ZN5doris10ForDecoderIoE9get_batchEPom |
684 | | |
685 | | template <typename T> |
686 | 3 | bool ForDecoder<T>::skip(int32_t skip_num) { |
687 | 3 | if (_current_index + skip_num >= _values_num) { |
688 | 0 | return false; |
689 | 0 | } |
690 | 3 | _current_index = _current_index + skip_num; |
691 | 3 | return true; |
692 | 3 | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE4skipEi Unexecuted instantiation: _ZN5doris10ForDecoderIsE4skipEi Unexecuted instantiation: _ZN5doris10ForDecoderIiE4skipEi Unexecuted instantiation: _ZN5doris10ForDecoderIlE4skipEi Unexecuted instantiation: _ZN5doris10ForDecoderInE4skipEi Unexecuted instantiation: _ZN5doris10ForDecoderIhE4skipEi Unexecuted instantiation: _ZN5doris10ForDecoderItE4skipEi _ZN5doris10ForDecoderIjE4skipEi Line | Count | Source | 686 | 3 | bool ForDecoder<T>::skip(int32_t skip_num) { | 687 | 3 | if (_current_index + skip_num >= _values_num) { | 688 | 0 | return false; | 689 | 0 | } | 690 | 3 | _current_index = _current_index + skip_num; | 691 | 3 | return true; | 692 | 3 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderImE4skipEi Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE4skipEi Unexecuted instantiation: _ZN5doris10ForDecoderIoE4skipEi |
693 | | |
694 | | template <typename T> |
695 | 6 | uint32_t ForDecoder<T>::seek_last_frame_before_value(T target) { |
696 | | // first of all, find the first frame >= target |
697 | 6 | uint32_t left = 0; |
698 | 6 | uint32_t right = _frame_count; |
699 | 18 | while (left < right) { |
700 | 12 | uint32_t mid = left + (right - left) / 2; |
701 | 12 | T midValue = decode_frame_min_value(mid); |
702 | 12 | if (midValue < target) { |
703 | 6 | left = mid + 1; |
704 | 6 | } else { |
705 | 6 | right = mid; |
706 | 6 | } |
707 | 12 | } |
708 | | // after loop, left is the first frame >= target |
709 | 6 | if (left == 0) { |
710 | | // all frames are >= target, not found |
711 | 2 | return _frame_count; |
712 | 2 | } |
713 | | // otherwise previous frame is the last frame < target |
714 | 4 | return left - 1; |
715 | 6 | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE28seek_last_frame_before_valueEa Unexecuted instantiation: _ZN5doris10ForDecoderIsE28seek_last_frame_before_valueEs Unexecuted instantiation: _ZN5doris10ForDecoderIiE28seek_last_frame_before_valueEi _ZN5doris10ForDecoderIlE28seek_last_frame_before_valueEl Line | Count | Source | 695 | 6 | uint32_t ForDecoder<T>::seek_last_frame_before_value(T target) { | 696 | | // first of all, find the first frame >= target | 697 | 6 | uint32_t left = 0; | 698 | 6 | uint32_t right = _frame_count; | 699 | 18 | while (left < right) { | 700 | 12 | uint32_t mid = left + (right - left) / 2; | 701 | 12 | T midValue = decode_frame_min_value(mid); | 702 | 12 | if (midValue < target) { | 703 | 6 | left = mid + 1; | 704 | 6 | } else { | 705 | 6 | right = mid; | 706 | 6 | } | 707 | 12 | } | 708 | | // after loop, left is the first frame >= target | 709 | 6 | if (left == 0) { | 710 | | // all frames are >= target, not found | 711 | 2 | return _frame_count; | 712 | 2 | } | 713 | | // otherwise previous frame is the last frame < target | 714 | 4 | return left - 1; | 715 | 6 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderInE28seek_last_frame_before_valueEn Unexecuted instantiation: _ZN5doris10ForDecoderIhE28seek_last_frame_before_valueEh Unexecuted instantiation: _ZN5doris10ForDecoderItE28seek_last_frame_before_valueEt Unexecuted instantiation: _ZN5doris10ForDecoderIjE28seek_last_frame_before_valueEj Unexecuted instantiation: _ZN5doris10ForDecoderImE28seek_last_frame_before_valueEm Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE28seek_last_frame_before_valueES1_ Unexecuted instantiation: _ZN5doris10ForDecoderIoE28seek_last_frame_before_valueEo |
716 | | |
717 | | template <typename T> |
718 | | bool ForDecoder<T>::seek_lower_bound_inside_frame(uint32_t frame_index, T target, |
719 | 4 | bool* exact_match) { |
720 | 4 | _current_index = frame_index * _max_frame_size; |
721 | 4 | decode_current_frame(_out_buffer.data()); |
722 | 4 | auto end = _out_buffer.begin() + frame_size(frame_index); |
723 | 4 | auto pos = std::lower_bound(_out_buffer.begin(), end, target); |
724 | 4 | if (pos != end) { // found in this frame |
725 | 2 | auto pos_in_frame = cast_set<uint32_t>(std::distance(_out_buffer.begin(), pos)); |
726 | 2 | *exact_match = _out_buffer[pos_in_frame] == target; |
727 | 2 | _current_index += pos_in_frame; |
728 | 2 | return true; |
729 | 2 | } |
730 | 2 | return false; |
731 | 4 | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE29seek_lower_bound_inside_frameEjaPb Unexecuted instantiation: _ZN5doris10ForDecoderIsE29seek_lower_bound_inside_frameEjsPb Unexecuted instantiation: _ZN5doris10ForDecoderIiE29seek_lower_bound_inside_frameEjiPb _ZN5doris10ForDecoderIlE29seek_lower_bound_inside_frameEjlPb Line | Count | Source | 719 | 4 | bool* exact_match) { | 720 | 4 | _current_index = frame_index * _max_frame_size; | 721 | 4 | decode_current_frame(_out_buffer.data()); | 722 | 4 | auto end = _out_buffer.begin() + frame_size(frame_index); | 723 | 4 | auto pos = std::lower_bound(_out_buffer.begin(), end, target); | 724 | 4 | if (pos != end) { // found in this frame | 725 | 2 | auto pos_in_frame = cast_set<uint32_t>(std::distance(_out_buffer.begin(), pos)); | 726 | 2 | *exact_match = _out_buffer[pos_in_frame] == target; | 727 | 2 | _current_index += pos_in_frame; | 728 | 2 | return true; | 729 | 2 | } | 730 | 2 | return false; | 731 | 4 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderInE29seek_lower_bound_inside_frameEjnPb Unexecuted instantiation: _ZN5doris10ForDecoderIhE29seek_lower_bound_inside_frameEjhPb Unexecuted instantiation: _ZN5doris10ForDecoderItE29seek_lower_bound_inside_frameEjtPb Unexecuted instantiation: _ZN5doris10ForDecoderIjE29seek_lower_bound_inside_frameEjjPb Unexecuted instantiation: _ZN5doris10ForDecoderImE29seek_lower_bound_inside_frameEjmPb Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE29seek_lower_bound_inside_frameEjS1_Pb Unexecuted instantiation: _ZN5doris10ForDecoderIoE29seek_lower_bound_inside_frameEjoPb |
732 | | |
733 | | template <typename T> |
734 | 6 | bool ForDecoder<T>::seek_at_or_after_value(const void* value, bool* exact_match) { |
735 | 6 | T target = *reinterpret_cast<const T*>(value); |
736 | 6 | uint32_t frame_to_search = seek_last_frame_before_value(target); |
737 | 6 | if (frame_to_search == _frame_count) { |
738 | | // all frames are >= target, the searched value must the be first value |
739 | 2 | _current_index = 0; |
740 | 2 | decode_current_frame(_out_buffer.data()); |
741 | 2 | *exact_match = _out_buffer[0] == target; |
742 | 2 | return true; |
743 | 2 | } |
744 | | // binary search inside the last frame < target |
745 | 4 | bool found = seek_lower_bound_inside_frame(frame_to_search, target, exact_match); |
746 | | // if not found, all values in the last frame are less than target. |
747 | | // then the searched value must be the first value of the next frame. |
748 | 4 | if (!found && frame_to_search < _frame_count - 1) { |
749 | 1 | _current_index = (frame_to_search + 1) * _max_frame_size; |
750 | 1 | decode_current_frame(_out_buffer.data()); |
751 | 1 | *exact_match = _out_buffer[0] == target; |
752 | 1 | return true; |
753 | 1 | } |
754 | 3 | return found; |
755 | 4 | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE22seek_at_or_after_valueEPKvPb Unexecuted instantiation: _ZN5doris10ForDecoderIsE22seek_at_or_after_valueEPKvPb Unexecuted instantiation: _ZN5doris10ForDecoderIiE22seek_at_or_after_valueEPKvPb _ZN5doris10ForDecoderIlE22seek_at_or_after_valueEPKvPb Line | Count | Source | 734 | 6 | bool ForDecoder<T>::seek_at_or_after_value(const void* value, bool* exact_match) { | 735 | 6 | T target = *reinterpret_cast<const T*>(value); | 736 | 6 | uint32_t frame_to_search = seek_last_frame_before_value(target); | 737 | 6 | if (frame_to_search == _frame_count) { | 738 | | // all frames are >= target, the searched value must the be first value | 739 | 2 | _current_index = 0; | 740 | 2 | decode_current_frame(_out_buffer.data()); | 741 | 2 | *exact_match = _out_buffer[0] == target; | 742 | 2 | return true; | 743 | 2 | } | 744 | | // binary search inside the last frame < target | 745 | 4 | bool found = seek_lower_bound_inside_frame(frame_to_search, target, exact_match); | 746 | | // if not found, all values in the last frame are less than target. | 747 | | // then the searched value must be the first value of the next frame. | 748 | 4 | if (!found && frame_to_search < _frame_count - 1) { | 749 | 1 | _current_index = (frame_to_search + 1) * _max_frame_size; | 750 | 1 | decode_current_frame(_out_buffer.data()); | 751 | 1 | *exact_match = _out_buffer[0] == target; | 752 | 1 | return true; | 753 | 1 | } | 754 | 3 | return found; | 755 | 4 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderInE22seek_at_or_after_valueEPKvPb Unexecuted instantiation: _ZN5doris10ForDecoderIhE22seek_at_or_after_valueEPKvPb Unexecuted instantiation: _ZN5doris10ForDecoderItE22seek_at_or_after_valueEPKvPb Unexecuted instantiation: _ZN5doris10ForDecoderIjE22seek_at_or_after_valueEPKvPb Unexecuted instantiation: _ZN5doris10ForDecoderImE22seek_at_or_after_valueEPKvPb Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE22seek_at_or_after_valueEPKvPb Unexecuted instantiation: _ZN5doris10ForDecoderIoE22seek_at_or_after_valueEPKvPb |
756 | | |
757 | | template class ForEncoder<int8_t>; |
758 | | template class ForEncoder<int16_t>; |
759 | | template class ForEncoder<int32_t>; |
760 | | template class ForEncoder<int64_t>; |
761 | | template class ForEncoder<int128_t>; |
762 | | template class ForEncoder<uint8_t>; |
763 | | template class ForEncoder<uint16_t>; |
764 | | template class ForEncoder<uint32_t>; |
765 | | template class ForEncoder<uint64_t>; |
766 | | template class ForEncoder<uint24_t>; |
767 | | template class ForEncoder<uint128_t>; |
768 | | |
769 | | template class ForDecoder<int8_t>; |
770 | | template class ForDecoder<int16_t>; |
771 | | template class ForDecoder<int32_t>; |
772 | | template class ForDecoder<int64_t>; |
773 | | template class ForDecoder<int128_t>; |
774 | | template class ForDecoder<uint8_t>; |
775 | | template class ForDecoder<uint16_t>; |
776 | | template class ForDecoder<uint32_t>; |
777 | | template class ForDecoder<uint64_t>; |
778 | | template class ForDecoder<uint24_t>; |
779 | | template class ForDecoder<uint128_t>; |
780 | | #include "common/compile_check_end.h" |
781 | | } // namespace doris |