be/src/util/frame_of_reference_coding.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "util/frame_of_reference_coding.h" |
19 | | |
20 | | #include <glog/logging.h> |
21 | | #include <sys/types.h> |
22 | | |
23 | | #include <algorithm> |
24 | | #include <cstring> |
25 | | #include <iostream> |
26 | | #include <iterator> |
27 | | #include <limits> |
28 | | |
29 | | #include "common/cast_set.h" |
30 | | #include "exec/common/endian.h" |
31 | | #include "util/bit_util.h" |
32 | | #include "util/coding.h" |
33 | | |
34 | | namespace doris { |
35 | | |
// copy_value: appends `count` values from `p_data` to the member buffer `_buffered_values`,
// starting at index `_buffered_values_num`, then advances `_buffered_values_num` by `count`.
// Returns `p_data + count`, i.e. the first input element that was not copied.
// No capacity check is performed in this function, so the caller must make sure the values fit
// (NOTE(review): the capacity of `_buffered_values` is declared outside this listing - confirm).
36 | | template <typename T> |
37 | 4.17M | const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) { |
38 | 4.17M | memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T)); |
39 | 4.17M | _buffered_values_num += count; |
40 | 4.17M | p_data += count; |
41 | 4.17M | return p_data; |
42 | 4.17M | } Unexecuted instantiation: _ZN5doris10ForEncoderIaE10copy_valueEPKam Unexecuted instantiation: _ZN5doris10ForEncoderIsE10copy_valueEPKsm _ZN5doris10ForEncoderIiE10copy_valueEPKim Line | Count | Source | 37 | 8 | const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) { | 38 | 8 | memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T)); | 39 | 8 | _buffered_values_num += count; | 40 | 8 | p_data += count; | 41 | 8 | return p_data; | 42 | 8 | } |
_ZN5doris10ForEncoderIlE10copy_valueEPKlm Line | Count | Source | 37 | 2.08M | const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) { | 38 | 2.08M | memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T)); | 39 | 2.08M | _buffered_values_num += count; | 40 | 2.08M | p_data += count; | 41 | 2.08M | return p_data; | 42 | 2.08M | } |
_ZN5doris10ForEncoderInE10copy_valueEPKnm Line | Count | Source | 37 | 2.08M | const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) { | 38 | 2.08M | memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T)); | 39 | 2.08M | _buffered_values_num += count; | 40 | 2.08M | p_data += count; | 41 | 2.08M | return p_data; | 42 | 2.08M | } |
Unexecuted instantiation: _ZN5doris10ForEncoderIhE10copy_valueEPKhm Unexecuted instantiation: _ZN5doris10ForEncoderItE10copy_valueEPKtm _ZN5doris10ForEncoderIjE10copy_valueEPKjm Line | Count | Source | 37 | 3 | const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) { | 38 | 3 | memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T)); | 39 | 3 | _buffered_values_num += count; | 40 | 3 | p_data += count; | 41 | 3 | return p_data; | 42 | 3 | } |
Unexecuted instantiation: _ZN5doris10ForEncoderImE10copy_valueEPKmm Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10copy_valueEPKS1_m Unexecuted instantiation: _ZN5doris10ForEncoderIoE10copy_valueEPKom |
43 | | |
// put_batch: accepts `count` values from `in_data` and adds `count` to `_values_num`.
//  - If the already buffered values plus `count` stay below FRAME_VALUE_NUM, the values are
//    only buffered via copy_value() and nothing is packed.
//  - Otherwise the buffer is filled up to FRAME_VALUE_NUM values and packed with
//    bit_packing_one_frame_value(); every further complete frame is packed straight from
//    `in_data`; any leftover values (fewer than FRAME_VALUE_NUM) are buffered via copy_value().
44 | | template <typename T> |
45 | 4.17M | void ForEncoder<T>::put_batch(const T* in_data, size_t count) { |
46 | 4.17M | if (_buffered_values_num + count < FRAME_VALUE_NUM) { |
47 | 4.16M | copy_value(in_data, count); |
48 | 4.16M | _values_num += count; |
49 | 4.16M | return; |
50 | 4.16M | } |
51 | | 
52 | | // 1. padding one frame |
53 | 16.3k | size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num; |
54 | 16.3k | in_data = copy_value(in_data, padding_num); |
55 | 16.3k | bit_packing_one_frame_value(_buffered_values); |
56 | | 
57 | | // 2. process frame by frame |
58 | 16.3k | size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM; |
59 | 16.4k | for (size_t i = 0; i < frame_size; i++) { |
60 | | // directly encode value to the bit_writer, don't buffer the value |
// NOTE(review): `_buffered_values_num` is set before every direct call, so
// bit_packing_one_frame_value() presumably reads it as the frame length - its body is not
// visible in this listing; confirm.
61 | 8 | _buffered_values_num = FRAME_VALUE_NUM; |
62 | 8 | bit_packing_one_frame_value(in_data); |
63 | 8 | in_data += FRAME_VALUE_NUM; |
64 | 8 | } |
65 | | 
66 | | // 3. process remaining value |
// NOTE(review): copy_value() appends at index `_buffered_values_num`, so this step relies on
// bit_packing_one_frame_value() having reset that counter - not visible here; confirm.
67 | 16.3k | size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM; |
68 | 16.3k | if (remaining_num > 0) { |
69 | 4 | copy_value(in_data, remaining_num); |
70 | 4 | } |
71 | | 
72 | 16.3k | _values_num += count; |
73 | 16.3k | } Unexecuted instantiation: _ZN5doris10ForEncoderIaE9put_batchEPKam Unexecuted instantiation: _ZN5doris10ForEncoderIsE9put_batchEPKsm _ZN5doris10ForEncoderIiE9put_batchEPKim Line | Count | Source | 45 | 7 | void ForEncoder<T>::put_batch(const T* in_data, size_t count) { | 46 | 7 | if (_buffered_values_num + count < FRAME_VALUE_NUM) { | 47 | 4 | copy_value(in_data, count); | 48 | 4 | _values_num += count; | 49 | 4 | return; | 50 | 4 | } | 51 | | | 52 | | // 1. padding one frame | 53 | 3 | size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num; | 54 | 3 | in_data = copy_value(in_data, padding_num); | 55 | 3 | bit_packing_one_frame_value(_buffered_values); | 56 | | | 57 | | // 2. process frame by frame | 58 | 3 | size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM; | 59 | 5 | for (size_t i = 0; i < frame_size; i++) { | 60 | | // directly encode value to the bit_writer, don't buffer the value | 61 | 2 | _buffered_values_num = FRAME_VALUE_NUM; | 62 | 2 | bit_packing_one_frame_value(in_data); | 63 | 2 | in_data += FRAME_VALUE_NUM; | 64 | 2 | } | 65 | | | 66 | | // 3. process remaining value | 67 | 3 | size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM; | 68 | 3 | if (remaining_num > 0) { | 69 | 1 | copy_value(in_data, remaining_num); | 70 | 1 | } | 71 | | | 72 | 3 | _values_num += count; | 73 | 3 | } |
_ZN5doris10ForEncoderIlE9put_batchEPKlm Line | Count | Source | 45 | 2.08M | void ForEncoder<T>::put_batch(const T* in_data, size_t count) { | 46 | 2.08M | if (_buffered_values_num + count < FRAME_VALUE_NUM) { | 47 | 2.08M | copy_value(in_data, count); | 48 | 2.08M | _values_num += count; | 49 | 2.08M | return; | 50 | 2.08M | } | 51 | | | 52 | | // 1. padding one frame | 53 | 8.19k | size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num; | 54 | 8.19k | in_data = copy_value(in_data, padding_num); | 55 | 8.19k | bit_packing_one_frame_value(_buffered_values); | 56 | | | 57 | | // 2. process frame by frame | 58 | 8.19k | size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM; | 59 | 8.19k | for (size_t i = 0; i < frame_size; i++) { | 60 | | // directly encode value to the bit_writer, don't buffer the value | 61 | 3 | _buffered_values_num = FRAME_VALUE_NUM; | 62 | 3 | bit_packing_one_frame_value(in_data); | 63 | 3 | in_data += FRAME_VALUE_NUM; | 64 | 3 | } | 65 | | | 66 | | // 3. process remaining value | 67 | 8.19k | size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM; | 68 | 8.19k | if (remaining_num > 0) { | 69 | 3 | copy_value(in_data, remaining_num); | 70 | 3 | } | 71 | | | 72 | 8.19k | _values_num += count; | 73 | 8.19k | } |
_ZN5doris10ForEncoderInE9put_batchEPKnm Line | Count | Source | 45 | 2.08M | void ForEncoder<T>::put_batch(const T* in_data, size_t count) { | 46 | 2.08M | if (_buffered_values_num + count < FRAME_VALUE_NUM) { | 47 | 2.08M | copy_value(in_data, count); | 48 | 2.08M | _values_num += count; | 49 | 2.08M | return; | 50 | 2.08M | } | 51 | | | 52 | | // 1. padding one frame | 53 | 8.19k | size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num; | 54 | 8.19k | in_data = copy_value(in_data, padding_num); | 55 | 8.19k | bit_packing_one_frame_value(_buffered_values); | 56 | | | 57 | | // 2. process frame by frame | 58 | 8.19k | size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM; | 59 | 8.19k | for (size_t i = 0; i < frame_size; i++) { | 60 | | // directly encode value to the bit_writer, don't buffer the value | 61 | 0 | _buffered_values_num = FRAME_VALUE_NUM; | 62 | 0 | bit_packing_one_frame_value(in_data); | 63 | 0 | in_data += FRAME_VALUE_NUM; | 64 | 0 | } | 65 | | | 66 | | // 3. process remaining value | 67 | 8.19k | size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM; | 68 | 8.19k | if (remaining_num > 0) { | 69 | 0 | copy_value(in_data, remaining_num); | 70 | 0 | } | 71 | | | 72 | 8.19k | _values_num += count; | 73 | 8.19k | } |
Unexecuted instantiation: _ZN5doris10ForEncoderIhE9put_batchEPKhm Unexecuted instantiation: _ZN5doris10ForEncoderItE9put_batchEPKtm _ZN5doris10ForEncoderIjE9put_batchEPKjm Line | Count | Source | 45 | 3 | void ForEncoder<T>::put_batch(const T* in_data, size_t count) { | 46 | 3 | if (_buffered_values_num + count < FRAME_VALUE_NUM) { | 47 | 0 | copy_value(in_data, count); | 48 | 0 | _values_num += count; | 49 | 0 | return; | 50 | 0 | } | 51 | | | 52 | | // 1. padding one frame | 53 | 3 | size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num; | 54 | 3 | in_data = copy_value(in_data, padding_num); | 55 | 3 | bit_packing_one_frame_value(_buffered_values); | 56 | | | 57 | | // 2. process frame by frame | 58 | 3 | size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM; | 59 | 6 | for (size_t i = 0; i < frame_size; i++) { | 60 | | // directly encode value to the bit_writer, don't buffer the value | 61 | 3 | _buffered_values_num = FRAME_VALUE_NUM; | 62 | 3 | bit_packing_one_frame_value(in_data); | 63 | 3 | in_data += FRAME_VALUE_NUM; | 64 | 3 | } | 65 | | | 66 | | // 3. process remaining value | 67 | 3 | size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM; | 68 | 3 | if (remaining_num > 0) { | 69 | 0 | copy_value(in_data, remaining_num); | 70 | 0 | } | 71 | | | 72 | 3 | _values_num += count; | 73 | 3 | } |
Unexecuted instantiation: _ZN5doris10ForEncoderImE9put_batchEPKmm Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE9put_batchEPKS1_m Unexecuted instantiation: _ZN5doris10ForEncoderIoE9put_batchEPKom |
74 | | |
75 | | // todo(kks): improve this method by SIMD instructions |
76 | | |
// bit_pack_8: writes `in_num` values from `input` to `output`, `bit_width` bits per value,
// most significant bits first and in input order.
//  - Groups of 8 values are OR-ed into a 64-bit accumulator and emitted as `bit_width` bytes.
//  - The remaining `in_num % 8` values are emitted as ceil(tail_count * bit_width / 8) bytes,
//    with zero bits appended after the last value.
// Values are OR-ed in without masking, so each one must already fit in `bit_width` bits.
// NOTE(review): 8 * bit_width bits have to fit in the int64_t accumulator, which presumably
// limits this routine to bit_width <= 8 - the caller is not visible in this listing; confirm.
77 | | template <typename T> |
78 | 15.3k | void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { |
79 | 15.3k | int64_t s = 0; |
80 | 15.3k | uint8_t output_mask = 255; |
81 | 15.3k | int tail_count = in_num & 7; // the remainder of in_num modulo 8 |
82 | 15.3k | int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8 |
83 | | 
84 | 237k | for (int i = 0; i < full_batch_size; i += 8) { |
85 | | // Put the 8 numbers in the input into s in order, each number occupies bit_width bit |
// input[i] lands in the highest bit_width bits of the group, input[i + 7] in the lowest.
86 | 222k | s |= static_cast<int64_t>(input[i + 7]); |
87 | 222k | s |= (static_cast<int64_t>(input[i + 6])) << bit_width; |
88 | 222k | s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width); |
89 | 222k | s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width); |
90 | 222k | s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width); |
91 | 222k | s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width); |
92 | 222k | s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width); |
93 | 222k | s |= (static_cast<int64_t>(input[i])) << (7 * bit_width); |
94 | | 
95 | | // Starting with the highest valid bit, take out 8 bits in sequence |
96 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid |
97 | | // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end |
// 8 values of bit_width bits each are exactly bit_width bytes, hence the loop bound.
98 | 1.22M | for (int j = 0; j < bit_width; j++) { |
99 | 1.00M | output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask; |
100 | 1.00M | } |
101 | 222k | output += bit_width; |
102 | 222k | s = 0; |
103 | 222k | } |
104 | | 
105 | | // remainder |
// Note the naming: `byte` holds a count of bits, `bytes` holds the rounded-up count of bytes.
106 | 15.3k | int byte = tail_count * bit_width; // How many bits are left to store |
107 | 15.3k | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input |
108 | | 
109 | | // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit |
110 | 65.4k | for (int i = 0; i < tail_count; i++) { |
111 | 50.1k | s |= (static_cast<int64_t>(input[i + full_batch_size])) |
112 | 50.1k | << ((tail_count - i - 1) * bit_width); |
113 | 50.1k | } |
114 | | 
115 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end |
116 | 15.3k | s <<= (bytes << 3) - byte; |
117 | | 
118 | | // Starting with the highest valid bit, take out 8 bits in sequence |
119 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid. |
120 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end |
121 | 48.4k | for (int i = 0; i < bytes; i++) { |
122 | 33.1k | output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask; |
123 | 33.1k | } |
124 | 15.3k | } Unexecuted instantiation: _ZN5doris10ForEncoderIaE10bit_pack_8EPKahiPh Unexecuted instantiation: _ZN5doris10ForEncoderIsE10bit_pack_8EPKshiPh _ZN5doris10ForEncoderIiE10bit_pack_8EPKihiPh Line | Count | Source | 78 | 8 | void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 79 | 8 | int64_t s = 0; | 80 | 8 | uint8_t output_mask = 255; | 81 | 8 | int tail_count = in_num & 7; // the remainder of in_num modulo 8 | 82 | 8 | int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8 | 83 | | | 84 | 104 | for (int i = 0; i < full_batch_size; i += 8) { | 85 | | // Put the 8 numbers in the input into s in order, each number occupies bit_width bit | 86 | 96 | s |= static_cast<int64_t>(input[i + 7]); | 87 | 96 | s |= (static_cast<int64_t>(input[i + 6])) << bit_width; | 88 | 96 | s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width); | 89 | 96 | s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width); | 90 | 96 | s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width); | 91 | 96 | s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width); | 92 | 96 | s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width); | 93 | 96 | s |= (static_cast<int64_t>(input[i])) << (7 * bit_width); | 94 | | | 95 | | // Starting with the highest valid bit, take out 8 bits in sequence | 96 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 97 | | // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end | 98 | 192 | for (int j = 0; j < bit_width; j++) { | 99 | 96 | output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask; | 100 | 96 | } | 101 | 96 | output += bit_width; | 102 | 96 | s = 0; | 103 | 96 | } | 104 | | | 105 | | // remainder | 106 | 8 | int byte = tail_count * bit_width; // How many bits are left to store | 107 | 8 | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input | 108 | | 
109 | | // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit | 110 | 10 | for (int i = 0; i < tail_count; i++) { | 111 | 2 | s |= (static_cast<int64_t>(input[i + full_batch_size])) | 112 | 2 | << ((tail_count - i - 1) * bit_width); | 113 | 2 | } | 114 | | | 115 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end | 116 | 8 | s <<= (bytes << 3) - byte; | 117 | | | 118 | | // Starting with the highest valid bit, take out 8 bits in sequence | 119 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid. | 120 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end | 121 | 9 | for (int i = 0; i < bytes; i++) { | 122 | 1 | output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask; | 123 | 1 | } | 124 | 8 | } |
_ZN5doris10ForEncoderIlE10bit_pack_8EPKlhiPh Line | Count | Source | 78 | 3.05k | void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 79 | 3.05k | int64_t s = 0; | 80 | 3.05k | uint8_t output_mask = 255; | 81 | 3.05k | int tail_count = in_num & 7; // the remainder of in_num modulo 8 | 82 | 3.05k | int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8 | 83 | | | 84 | 34.9k | for (int i = 0; i < full_batch_size; i += 8) { | 85 | | // Put the 8 numbers in the input into s in order, each number occupies bit_width bit | 86 | 31.8k | s |= static_cast<int64_t>(input[i + 7]); | 87 | 31.8k | s |= (static_cast<int64_t>(input[i + 6])) << bit_width; | 88 | 31.8k | s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width); | 89 | 31.8k | s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width); | 90 | 31.8k | s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width); | 91 | 31.8k | s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width); | 92 | 31.8k | s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width); | 93 | 31.8k | s |= (static_cast<int64_t>(input[i])) << (7 * bit_width); | 94 | | | 95 | | // Starting with the highest valid bit, take out 8 bits in sequence | 96 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 97 | | // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end | 98 | 174k | for (int j = 0; j < bit_width; j++) { | 99 | 142k | output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask; | 100 | 142k | } | 101 | 31.8k | output += bit_width; | 102 | 31.8k | s = 0; | 103 | 31.8k | } | 104 | | | 105 | | // remainder | 106 | 3.05k | int byte = tail_count * bit_width; // How many bits are left to store | 107 | 3.05k | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input | 108 | | | 109 | | // Put the tail_count numbers in the input into s in order, each number occupies 
bit_width bit | 110 | 10.2k | for (int i = 0; i < tail_count; i++) { | 111 | 7.16k | s |= (static_cast<int64_t>(input[i + full_batch_size])) | 112 | 7.16k | << ((tail_count - i - 1) * bit_width); | 113 | 7.16k | } | 114 | | | 115 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end | 116 | 3.05k | s <<= (bytes << 3) - byte; | 117 | | | 118 | | // Starting with the highest valid bit, take out 8 bits in sequence | 119 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid. | 120 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end | 121 | 7.77k | for (int i = 0; i < bytes; i++) { | 122 | 4.72k | output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask; | 123 | 4.72k | } | 124 | 3.05k | } |
_ZN5doris10ForEncoderInE10bit_pack_8EPKnhiPh Line | Count | Source | 78 | 12.2k | void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 79 | 12.2k | int64_t s = 0; | 80 | 12.2k | uint8_t output_mask = 255; | 81 | 12.2k | int tail_count = in_num & 7; // the remainder of in_num modulo 8 | 82 | 12.2k | int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8 | 83 | | | 84 | 202k | for (int i = 0; i < full_batch_size; i += 8) { | 85 | | // Put the 8 numbers in the input into s in order, each number occupies bit_width bit | 86 | 190k | s |= static_cast<int64_t>(input[i + 7]); | 87 | 190k | s |= (static_cast<int64_t>(input[i + 6])) << bit_width; | 88 | 190k | s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width); | 89 | 190k | s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width); | 90 | 190k | s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width); | 91 | 190k | s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width); | 92 | 190k | s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width); | 93 | 190k | s |= (static_cast<int64_t>(input[i])) << (7 * bit_width); | 94 | | | 95 | | // Starting with the highest valid bit, take out 8 bits in sequence | 96 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 97 | | // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end | 98 | 1.04M | for (int j = 0; j < bit_width; j++) { | 99 | 857k | output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask; | 100 | 857k | } | 101 | 190k | output += bit_width; | 102 | 190k | s = 0; | 103 | 190k | } | 104 | | | 105 | | // remainder | 106 | 12.2k | int byte = tail_count * bit_width; // How many bits are left to store | 107 | 12.2k | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input | 108 | | | 109 | | // Put the tail_count numbers in the input into s in order, each number occupies bit_width 
bit | 110 | 55.2k | for (int i = 0; i < tail_count; i++) { | 111 | 43.0k | s |= (static_cast<int64_t>(input[i + full_batch_size])) | 112 | 43.0k | << ((tail_count - i - 1) * bit_width); | 113 | 43.0k | } | 114 | | | 115 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end | 116 | 12.2k | s <<= (bytes << 3) - byte; | 117 | | | 118 | | // Starting with the highest valid bit, take out 8 bits in sequence | 119 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid. | 120 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end | 121 | 40.6k | for (int i = 0; i < bytes; i++) { | 122 | 28.4k | output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask; | 123 | 28.4k | } | 124 | 12.2k | } |
Unexecuted instantiation: _ZN5doris10ForEncoderIhE10bit_pack_8EPKhhiPh Unexecuted instantiation: _ZN5doris10ForEncoderItE10bit_pack_8EPKthiPh _ZN5doris10ForEncoderIjE10bit_pack_8EPKjhiPh Line | Count | Source | 78 | 6 | void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 79 | 6 | int64_t s = 0; | 80 | 6 | uint8_t output_mask = 255; | 81 | 6 | int tail_count = in_num & 7; // the remainder of in_num modulo 8 | 82 | 6 | int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8 | 83 | | | 84 | 102 | for (int i = 0; i < full_batch_size; i += 8) { | 85 | | // Put the 8 numbers in the input into s in order, each number occupies bit_width bit | 86 | 96 | s |= static_cast<int64_t>(input[i + 7]); | 87 | 96 | s |= (static_cast<int64_t>(input[i + 6])) << bit_width; | 88 | 96 | s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width); | 89 | 96 | s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width); | 90 | 96 | s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width); | 91 | 96 | s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width); | 92 | 96 | s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width); | 93 | 96 | s |= (static_cast<int64_t>(input[i])) << (7 * bit_width); | 94 | | | 95 | | // Starting with the highest valid bit, take out 8 bits in sequence | 96 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 97 | | // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end | 98 | 192 | for (int j = 0; j < bit_width; j++) { | 99 | 96 | output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask; | 100 | 96 | } | 101 | 96 | output += bit_width; | 102 | 96 | s = 0; | 103 | 96 | } | 104 | | | 105 | | // remainder | 106 | 6 | int byte = tail_count * bit_width; // How many bits are left to store | 107 | 6 | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input | 108 | | | 109 | | // Put 
the tail_count numbers in the input into s in order, each number occupies bit_width bit | 110 | 6 | for (int i = 0; i < tail_count; i++) { | 111 | 0 | s |= (static_cast<int64_t>(input[i + full_batch_size])) | 112 | 0 | << ((tail_count - i - 1) * bit_width); | 113 | 0 | } | 114 | | | 115 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end | 116 | 6 | s <<= (bytes << 3) - byte; | 117 | | | 118 | | // Starting with the highest valid bit, take out 8 bits in sequence | 119 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid. | 120 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end | 121 | 6 | for (int i = 0; i < bytes; i++) { | 122 | 0 | output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask; | 123 | 0 | } | 124 | 6 | } |
Unexecuted instantiation: _ZN5doris10ForEncoderImE10bit_pack_8EPKmhiPh Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10bit_pack_8EPKS1_hiPh Unexecuted instantiation: _ZN5doris10ForEncoderIoE10bit_pack_8EPKohiPh |
125 | | |
// bit_pack_4<U>: writes `in_num` values from `input` to `output`, `bit_width` bits per value,
// most significant bits first and in input order, using an accumulator `s` of type U.
//  - Values are consumed 4 at a time. 4 * bit_width need not be a multiple of 8, so up to 7
//    not-yet-written bits (`extra_bit`) stay in the low bits of `s` and are carried into the
//    next iteration.
//  - The remaining `in_num % 4` values are appended to the carried bits and the result is
//    zero-padded at the end to a whole number of bytes.
// Values are OR-ed in without masking, so each one must already fit in `bit_width` bits.
// NOTE(review): U presumably has to hold 4 * bit_width + 7 bits - the caller that selects U
// is not visible in this listing; confirm.
126 | | template <typename T> |
127 | | template <typename U> |
128 | 45.8k | void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { |
129 | 45.8k | U s = 0; |
130 | 45.8k | uint8_t output_mask = 255; |
131 | 45.8k | int tail_count = in_num & 3; // the remainder of in_num modulo 4 |
132 | 45.8k | int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4 |
133 | 45.8k | int output_size = 0; // How many outputs can be processed at a time |
134 | 45.8k | int bit_width_remainder = |
135 | 45.8k | (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time |
136 | 45.8k | int extra_bit = 0; // Extra bits after each process |
137 | | 
138 | 1.40M | for (int i = 0; i < full_batch_size; i += 4) { |
139 | | // Put the 4 numbers in the input into s in order, each number occupies bit_width bit |
140 | | // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop |
141 | 1.35M | s <<= bit_width; |
142 | 1.35M | s |= (static_cast<U>(input[i])); |
143 | 1.35M | s <<= bit_width; |
144 | 1.35M | s |= (static_cast<U>(input[i + 1])); |
145 | 1.35M | s <<= bit_width; |
146 | 1.35M | s |= (static_cast<U>(input[i + 2])); |
147 | 1.35M | s <<= bit_width; |
148 | 1.35M | s |= (static_cast<U>(input[i + 3])); |
149 | | 
150 | | // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s, |
151 | | // and there are extra_bit bits left over from the last loop, |
152 | | // divide by 8 to calculate how much output can be processed in this loop. |
// `extra_bit` still holds the previous iteration's carry on the next line; it is updated
// for the current iteration only afterwards.
153 | 1.35M | output_size = ((bit_width << 2) + extra_bit) >> 3; |
154 | | 
155 | | // Each loop will leave bit_width_remainder bit unprocessed, |
156 | | // last loop will leave extra_bit bit, eventually will leave |
157 | | // (extra_bit + bit_width_remainder) & 7 bit unprocessed |
158 | 1.35M | extra_bit = (extra_bit + bit_width_remainder) & 7; |
159 | | 
160 | | // Starting with the highest valid bit, take out 8 bits in sequence |
161 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid |
162 | | // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end |
163 | | // But since there are still extra_bit bits that can't be processed, need to add the extra_bit |
164 | 15.2M | for (int j = 0; j < output_size; j++) { |
165 | 13.8M | output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask; |
166 | 13.8M | } |
167 | 1.35M | output += output_size; |
168 | | 
169 | | // s retains the post extra_bit bit as it is not processed |
// extra_bit was masked with `& 7` above, so the int-typed shift below is at most 1 << 7.
170 | 1.35M | s &= (1 << extra_bit) - 1; |
171 | 1.35M | } |
172 | | 
173 | | // remainder |
// Note the naming: `byte` holds a count of bits, `bytes` holds the rounded-up count of bytes.
// The `extra_bit != 0` test is redundant: adding 0 would leave `byte` unchanged.
174 | 45.8k | int byte = tail_count * bit_width; // How many bits are left to store |
175 | 45.8k | if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed |
176 | 45.8k | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input |
177 | | 
178 | | // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit |
// Unlike the main loop, the value is OR-ed in without static_cast<U>; the conversion from T
// to U is implicit here.
179 | 110k | for (int i = 0; i < tail_count; i++) { |
180 | 64.4k | s <<= bit_width; |
181 | 64.4k | s |= (input[i + full_batch_size]); |
182 | 64.4k | } |
183 | | 
184 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end |
185 | 45.8k | s <<= (bytes << 3) - byte; |
186 | | 
187 | | // Starting with the highest valid bit, take out 8 bits in sequence |
188 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid. |
189 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end |
190 | 231k | for (int i = 0; i < bytes; i++) { |
191 | 185k | output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask; |
192 | 185k | } |
193 | 45.8k | } Unexecuted instantiation: _ZN5doris10ForEncoderIaE10bit_pack_4IlEEvPKahiPh Unexecuted instantiation: _ZN5doris10ForEncoderIaE10bit_pack_4InEEvPKahiPh Unexecuted instantiation: _ZN5doris10ForEncoderIsE10bit_pack_4IlEEvPKshiPh Unexecuted instantiation: _ZN5doris10ForEncoderIsE10bit_pack_4InEEvPKshiPh Unexecuted instantiation: _ZN5doris10ForEncoderIiE10bit_pack_4IlEEvPKihiPh Unexecuted instantiation: _ZN5doris10ForEncoderIiE10bit_pack_4InEEvPKihiPh _ZN5doris10ForEncoderIlE10bit_pack_4IlEEvPKlhiPh Line | Count | Source | 128 | 3.03k | void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 129 | 3.03k | U s = 0; | 130 | 3.03k | uint8_t output_mask = 255; | 131 | 3.03k | int tail_count = in_num & 3; // the remainder of in_num modulo 4 | 132 | 3.03k | int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4 | 133 | 3.03k | int output_size = 0; // How many outputs can be processed at a time | 134 | 3.03k | int bit_width_remainder = | 135 | 3.03k | (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time | 136 | 3.03k | int extra_bit = 0; // Extra bits after each process | 137 | | | 138 | 67.5k | for (int i = 0; i < full_batch_size; i += 4) { | 139 | | // Put the 4 numbers in the input into s in order, each number occupies bit_width bit | 140 | | // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop | 141 | 64.5k | s <<= bit_width; | 142 | 64.5k | s |= (static_cast<U>(input[i])); | 143 | 64.5k | s <<= bit_width; | 144 | 64.5k | s |= (static_cast<U>(input[i + 1])); | 145 | 64.5k | s <<= bit_width; | 146 | 64.5k | s |= (static_cast<U>(input[i + 2])); | 147 | 64.5k | s <<= bit_width; | 148 | 64.5k | s |= (static_cast<U>(input[i + 3])); | 149 | | | 150 | | // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s, | 151 | | // and there are extra_bit bits left over from the last loop, | 152 | | 
// divide by 8 to calculate how much output can be processed in this loop. | 153 | 64.5k | output_size = ((bit_width << 2) + extra_bit) >> 3; | 154 | | | 155 | | // Each loop will leave bit_width_remainder bit unprocessed, | 156 | | // last loop will leave extra_bit bit, eventually will leave | 157 | | // (extra_bit + bit_width_remainder) & 7 bit unprocessed | 158 | 64.5k | extra_bit = (extra_bit + bit_width_remainder) & 7; | 159 | | | 160 | | // Starting with the highest valid bit, take out 8 bits in sequence | 161 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 162 | | // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end | 163 | | // But since there are still extra_bit bits that can't be processed, need to add the extra_bit | 164 | 467k | for (int j = 0; j < output_size; j++) { | 165 | 402k | output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask; | 166 | 402k | } | 167 | 64.5k | output += output_size; | 168 | | | 169 | | // s retains the post extra_bit bit as it is not processed | 170 | 64.5k | s &= (1 << extra_bit) - 1; | 171 | 64.5k | } | 172 | | | 173 | | // remainder | 174 | 3.03k | int byte = tail_count * bit_width; // How many bits are left to store | 175 | 3.03k | if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed | 176 | 3.03k | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input | 177 | | | 178 | | // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit | 179 | 6.08k | for (int i = 0; i < tail_count; i++) { | 180 | 3.04k | s <<= bit_width; | 181 | 3.04k | s |= (input[i + full_batch_size]); | 182 | 3.04k | } | 183 | | | 184 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end | 185 | 3.03k | s <<= (bytes << 3) - byte; | 186 | | | 187 | | // Starting with the highest valid bit, take out 8 bits in sequence | 188 | | // 
perform an AND operation with output_mask to ensure that only 8 bits are valid. | 189 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end | 190 | 8.75k | for (int i = 0; i < bytes; i++) { | 191 | 5.71k | output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask; | 192 | 5.71k | } | 193 | 3.03k | } |
_ZN5doris10ForEncoderIlE10bit_pack_4InEEvPKlhiPh Line | Count | Source | 128 | 6.08k | void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 129 | 6.08k | U s = 0; | 130 | 6.08k | uint8_t output_mask = 255; | 131 | 6.08k | int tail_count = in_num & 3; // the remainder of in_num modulo 4 | 132 | 6.08k | int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4 | 133 | 6.08k | int output_size = 0; // How many outputs can be processed at a time | 134 | 6.08k | int bit_width_remainder = | 135 | 6.08k | (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time | 136 | 6.08k | int extra_bit = 0; // Extra bits after each process | 137 | | | 138 | 135k | for (int i = 0; i < full_batch_size; i += 4) { | 139 | | // Put the 4 numbers in the input into s in order, each number occupies bit_width bit | 140 | | // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop | 141 | 129k | s <<= bit_width; | 142 | 129k | s |= (static_cast<U>(input[i])); | 143 | 129k | s <<= bit_width; | 144 | 129k | s |= (static_cast<U>(input[i + 1])); | 145 | 129k | s <<= bit_width; | 146 | 129k | s |= (static_cast<U>(input[i + 2])); | 147 | 129k | s <<= bit_width; | 148 | 129k | s |= (static_cast<U>(input[i + 3])); | 149 | | | 150 | | // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s, | 151 | | // and there are extra_bit bits left over from the last loop, | 152 | | // divide by 8 to calculate how much output can be processed in this loop. 
| 153 | 129k | output_size = ((bit_width << 2) + extra_bit) >> 3; | 154 | | | 155 | | // Each loop will leave bit_width_remainder bit unprocessed, | 156 | | // last loop will leave extra_bit bit, eventually will leave | 157 | | // (extra_bit + bit_width_remainder) & 7 bit unprocessed | 158 | 129k | extra_bit = (extra_bit + bit_width_remainder) & 7; | 159 | | | 160 | | // Starting with the highest valid bit, take out 8 bits in sequence | 161 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 162 | | // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end | 163 | | // But since there are still extra_bit bits that can't be processed, need to add the extra_bit | 164 | 1.70M | for (int j = 0; j < output_size; j++) { | 165 | 1.58M | output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask; | 166 | 1.58M | } | 167 | 129k | output += output_size; | 168 | | | 169 | | // s retains the post extra_bit bit as it is not processed | 170 | 129k | s &= (1 << extra_bit) - 1; | 171 | 129k | } | 172 | | | 173 | | // remainder | 174 | 6.08k | int byte = tail_count * bit_width; // How many bits are left to store | 175 | 6.08k | if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed | 176 | 6.08k | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input | 177 | | | 178 | | // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit | 179 | 12.2k | for (int i = 0; i < tail_count; i++) { | 180 | 6.12k | s <<= bit_width; | 181 | 6.12k | s |= (input[i + full_batch_size]); | 182 | 6.12k | } | 183 | | | 184 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end | 185 | 6.08k | s <<= (bytes << 3) - byte; | 186 | | | 187 | | // Starting with the highest valid bit, take out 8 bits in sequence | 188 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid. 
| 189 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end | 190 | 26.7k | for (int i = 0; i < bytes; i++) { | 191 | 20.6k | output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask; | 192 | 20.6k | } | 193 | 6.08k | } |
_ZN5doris10ForEncoderInE10bit_pack_4IlEEvPKnhiPh Line | Count | Source | 128 | 12.2k | void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 129 | 12.2k | U s = 0; | 130 | 12.2k | uint8_t output_mask = 255; | 131 | 12.2k | int tail_count = in_num & 3; // the remainder of in_num modulo 4 | 132 | 12.2k | int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4 | 133 | 12.2k | int output_size = 0; // How many outputs can be processed at a time | 134 | 12.2k | int bit_width_remainder = | 135 | 12.2k | (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time | 136 | 12.2k | int extra_bit = 0; // Extra bits after each process | 137 | | | 138 | 399k | for (int i = 0; i < full_batch_size; i += 4) { | 139 | | // Put the 4 numbers in the input into s in order, each number occupies bit_width bit | 140 | | // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop | 141 | 387k | s <<= bit_width; | 142 | 387k | s |= (static_cast<U>(input[i])); | 143 | 387k | s <<= bit_width; | 144 | 387k | s |= (static_cast<U>(input[i + 1])); | 145 | 387k | s <<= bit_width; | 146 | 387k | s |= (static_cast<U>(input[i + 2])); | 147 | 387k | s <<= bit_width; | 148 | 387k | s |= (static_cast<U>(input[i + 3])); | 149 | | | 150 | | // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s, | 151 | | // and there are extra_bit bits left over from the last loop, | 152 | | // divide by 8 to calculate how much output can be processed in this loop. 
| 153 | 387k | output_size = ((bit_width << 2) + extra_bit) >> 3; | 154 | | | 155 | | // Each loop will leave bit_width_remainder bit unprocessed, | 156 | | // last loop will leave extra_bit bit, eventually will leave | 157 | | // (extra_bit + bit_width_remainder) & 7 bit unprocessed | 158 | 387k | extra_bit = (extra_bit + bit_width_remainder) & 7; | 159 | | | 160 | | // Starting with the highest valid bit, take out 8 bits in sequence | 161 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 162 | | // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end | 163 | | // But since there are still extra_bit bits that can't be processed, need to add the extra_bit | 164 | 2.80M | for (int j = 0; j < output_size; j++) { | 165 | 2.41M | output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask; | 166 | 2.41M | } | 167 | 387k | output += output_size; | 168 | | | 169 | | // s retains the post extra_bit bit as it is not processed | 170 | 387k | s &= (1 << extra_bit) - 1; | 171 | 387k | } | 172 | | | 173 | | // remainder | 174 | 12.2k | int byte = tail_count * bit_width; // How many bits are left to store | 175 | 12.2k | if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed | 176 | 12.2k | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input | 177 | | | 178 | | // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit | 179 | 30.6k | for (int i = 0; i < tail_count; i++) { | 180 | 18.4k | s <<= bit_width; | 181 | 18.4k | s |= (input[i + full_batch_size]); | 182 | 18.4k | } | 183 | | | 184 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end | 185 | 12.2k | s <<= (bytes << 3) - byte; | 186 | | | 187 | | // Starting with the highest valid bit, take out 8 bits in sequence | 188 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid. 
| 189 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end | 190 | 46.8k | for (int i = 0; i < bytes; i++) { | 191 | 34.5k | output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask; | 192 | 34.5k | } | 193 | 12.2k | } |
_ZN5doris10ForEncoderInE10bit_pack_4InEEvPKnhiPh Line | Count | Source | 128 | 24.4k | void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 129 | 24.4k | U s = 0; | 130 | 24.4k | uint8_t output_mask = 255; | 131 | 24.4k | int tail_count = in_num & 3; // the remainder of in_num modulo 4 | 132 | 24.4k | int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4 | 133 | 24.4k | int output_size = 0; // How many outputs can be processed at a time | 134 | 24.4k | int bit_width_remainder = | 135 | 24.4k | (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time | 136 | 24.4k | int extra_bit = 0; // Extra bits after each process | 137 | | | 138 | 798k | for (int i = 0; i < full_batch_size; i += 4) { | 139 | | // Put the 4 numbers in the input into s in order, each number occupies bit_width bit | 140 | | // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop | 141 | 774k | s <<= bit_width; | 142 | 774k | s |= (static_cast<U>(input[i])); | 143 | 774k | s <<= bit_width; | 144 | 774k | s |= (static_cast<U>(input[i + 1])); | 145 | 774k | s <<= bit_width; | 146 | 774k | s |= (static_cast<U>(input[i + 2])); | 147 | 774k | s <<= bit_width; | 148 | 774k | s |= (static_cast<U>(input[i + 3])); | 149 | | | 150 | | // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s, | 151 | | // and there are extra_bit bits left over from the last loop, | 152 | | // divide by 8 to calculate how much output can be processed in this loop. 
| 153 | 774k | output_size = ((bit_width << 2) + extra_bit) >> 3; | 154 | | | 155 | | // Each loop will leave bit_width_remainder bit unprocessed, | 156 | | // last loop will leave extra_bit bit, eventually will leave | 157 | | // (extra_bit + bit_width_remainder) & 7 bit unprocessed | 158 | 774k | extra_bit = (extra_bit + bit_width_remainder) & 7; | 159 | | | 160 | | // Starting with the highest valid bit, take out 8 bits in sequence | 161 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 162 | | // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end | 163 | | // But since there are still extra_bit bits that can't be processed, need to add the extra_bit | 164 | 10.2M | for (int j = 0; j < output_size; j++) { | 165 | 9.48M | output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask; | 166 | 9.48M | } | 167 | 774k | output += output_size; | 168 | | | 169 | | // s retains the post extra_bit bit as it is not processed | 170 | 774k | s &= (1 << extra_bit) - 1; | 171 | 774k | } | 172 | | | 173 | | // remainder | 174 | 24.4k | int byte = tail_count * bit_width; // How many bits are left to store | 175 | 24.4k | if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed | 176 | 24.4k | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input | 177 | | | 178 | | // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit | 179 | 61.3k | for (int i = 0; i < tail_count; i++) { | 180 | 36.8k | s <<= bit_width; | 181 | 36.8k | s |= (input[i + full_batch_size]); | 182 | 36.8k | } | 183 | | | 184 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end | 185 | 24.4k | s <<= (bytes << 3) - byte; | 186 | | | 187 | | // Starting with the highest valid bit, take out 8 bits in sequence | 188 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid. 
| 189 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end | 190 | 148k | for (int i = 0; i < bytes; i++) { | 191 | 124k | output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask; | 192 | 124k | } | 193 | 24.4k | } |
Unexecuted instantiation: _ZN5doris10ForEncoderIhE10bit_pack_4IlEEvPKhhiPh Unexecuted instantiation: _ZN5doris10ForEncoderIhE10bit_pack_4InEEvPKhhiPh Unexecuted instantiation: _ZN5doris10ForEncoderItE10bit_pack_4IlEEvPKthiPh Unexecuted instantiation: _ZN5doris10ForEncoderItE10bit_pack_4InEEvPKthiPh Unexecuted instantiation: _ZN5doris10ForEncoderIjE10bit_pack_4IlEEvPKjhiPh Unexecuted instantiation: _ZN5doris10ForEncoderIjE10bit_pack_4InEEvPKjhiPh Unexecuted instantiation: _ZN5doris10ForEncoderImE10bit_pack_4IlEEvPKmhiPh Unexecuted instantiation: _ZN5doris10ForEncoderImE10bit_pack_4InEEvPKmhiPh Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10bit_pack_4IlEEvPKS1_hiPh Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10bit_pack_4InEEvPKS1_hiPh Unexecuted instantiation: _ZN5doris10ForEncoderIoE10bit_pack_4IlEEvPKohiPh Unexecuted instantiation: _ZN5doris10ForEncoderIoE10bit_pack_4InEEvPKohiPh |
194 | | |
195 | | template <typename T> |
196 | 181k | void ForEncoder<T>::bit_pack_1(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { |
197 | 181k | int output_mask = 255; |
198 | 181k | int need_bit = 0; // still need |
199 | | |
200 | 21.9M | for (int i = 0; i < in_num; i++) { |
201 | 21.7M | T x = input[i]; |
202 | 21.7M | int width = bit_width; |
203 | 21.7M | if (need_bit) { |
204 | | // The last time we take away the high 8 - need_bit, |
205 | | // we need to make up the rest of the need_bit from the width. |
206 | | // Use width - need_bit to compute high need_bit bits |
207 | 15.0M | *output |= x >> (width - need_bit); |
208 | 15.0M | output++; |
209 | | // There are need_bit bits being used, so subtract |
210 | 15.0M | width -= need_bit; |
211 | 15.0M | } |
212 | 21.7M | int num = width >> 3; // How many outputs can be processed at a time |
213 | 21.7M | int remainder = width & 7; // How many bits are left to store |
214 | | |
215 | | // Starting with the highest valid bit, take out 8 bits in sequence |
216 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid |
217 | | // (num-j-1)<<3 used to calculate how many bits need to be removed at the end |
218 | | // But since there are still remainder bits that can't be processed, need to add the remainder |
219 | 223M | for (int j = 0; j < num; j++) { |
220 | 202M | *output = cast_set<uint8_t>((x >> (((num - j - 1) << 3) + remainder)) & output_mask); |
221 | 202M | output++; |
222 | 202M | } |
223 | 21.7M | if (remainder) { |
224 | | // Process the last remaining remainder bit. |
225 | | // y = (x & ((1 << remainder) - 1)) extract the last remainder bits. |
226 | | // ouput = y << (8 - reaminder) Use the high 8 - remainder bit |
227 | 15.1M | *output = cast_set<uint8_t>((x & ((1 << remainder) - 1)) << (8 - remainder)); |
228 | | // Already have remainder bits, next time need 8-remainder bits |
229 | 15.1M | need_bit = 8 - remainder; |
230 | 15.1M | } else { |
231 | 6.57M | need_bit = 0; |
232 | 6.57M | } |
233 | 21.7M | } |
234 | 181k | } Unexecuted instantiation: _ZN5doris10ForEncoderIaE10bit_pack_1EPKahiPh Unexecuted instantiation: _ZN5doris10ForEncoderIsE10bit_pack_1EPKshiPh Unexecuted instantiation: _ZN5doris10ForEncoderIiE10bit_pack_1EPKihiPh _ZN5doris10ForEncoderIlE10bit_pack_1EPKlhiPh Line | Count | Source | 196 | 12.1k | void ForEncoder<T>::bit_pack_1(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 197 | 12.1k | int output_mask = 255; | 198 | 12.1k | int need_bit = 0; // still need | 199 | | | 200 | 1.05M | for (int i = 0; i < in_num; i++) { | 201 | 1.04M | T x = input[i]; | 202 | 1.04M | int width = bit_width; | 203 | 1.04M | if (need_bit) { | 204 | | // The last time we take away the high 8 - need_bit, | 205 | | // we need to make up the rest of the need_bit from the width. | 206 | | // Use width - need_bit to compute high need_bit bits | 207 | 743k | *output |= x >> (width - need_bit); | 208 | 743k | output++; | 209 | | // There are need_bit bits being used, so subtract | 210 | 743k | width -= need_bit; | 211 | 743k | } | 212 | 1.04M | int num = width >> 3; // How many outputs can be processed at a time | 213 | 1.04M | int remainder = width & 7; // How many bits are left to store | 214 | | | 215 | | // Starting with the highest valid bit, take out 8 bits in sequence | 216 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 217 | | // (num-j-1)<<3 used to calculate how many bits need to be removed at the end | 218 | | // But since there are still remainder bits that can't be processed, need to add the remainder | 219 | 6.62M | for (int j = 0; j < num; j++) { | 220 | 5.58M | *output = cast_set<uint8_t>((x >> (((num - j - 1) << 3) + remainder)) & output_mask); | 221 | 5.58M | output++; | 222 | 5.58M | } | 223 | 1.04M | if (remainder) { | 224 | | // Process the last remaining remainder bit. | 225 | | // y = (x & ((1 << remainder) - 1)) extract the last remainder bits. 
| 226 | | // ouput = y << (8 - reaminder) Use the high 8 - remainder bit | 227 | 749k | *output = cast_set<uint8_t>((x & ((1 << remainder) - 1)) << (8 - remainder)); | 228 | | // Already have remainder bits, next time need 8-remainder bits | 229 | 749k | need_bit = 8 - remainder; | 230 | 749k | } else { | 231 | 294k | need_bit = 0; | 232 | 294k | } | 233 | 1.04M | } | 234 | 12.1k | } |
_ZN5doris10ForEncoderInE10bit_pack_1EPKnhiPh Line | Count | Source | 196 | 169k | void ForEncoder<T>::bit_pack_1(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 197 | 169k | int output_mask = 255; | 198 | 169k | int need_bit = 0; // still need | 199 | | | 200 | 20.8M | for (int i = 0; i < in_num; i++) { | 201 | 20.6M | T x = input[i]; | 202 | 20.6M | int width = bit_width; | 203 | 20.6M | if (need_bit) { | 204 | | // The last time we take away the high 8 - need_bit, | 205 | | // we need to make up the rest of the need_bit from the width. | 206 | | // Use width - need_bit to compute high need_bit bits | 207 | 14.3M | *output |= x >> (width - need_bit); | 208 | 14.3M | output++; | 209 | | // There are need_bit bits being used, so subtract | 210 | 14.3M | width -= need_bit; | 211 | 14.3M | } | 212 | 20.6M | int num = width >> 3; // How many outputs can be processed at a time | 213 | 20.6M | int remainder = width & 7; // How many bits are left to store | 214 | | | 215 | | // Starting with the highest valid bit, take out 8 bits in sequence | 216 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 217 | | // (num-j-1)<<3 used to calculate how many bits need to be removed at the end | 218 | | // But since there are still remainder bits that can't be processed, need to add the remainder | 219 | 217M | for (int j = 0; j < num; j++) { | 220 | 196M | *output = cast_set<uint8_t>((x >> (((num - j - 1) << 3) + remainder)) & output_mask); | 221 | 196M | output++; | 222 | 196M | } | 223 | 20.6M | if (remainder) { | 224 | | // Process the last remaining remainder bit. | 225 | | // y = (x & ((1 << remainder) - 1)) extract the last remainder bits. 
| 226 | | // ouput = y << (8 - reaminder) Use the high 8 - remainder bit | 227 | 14.4M | *output = cast_set<uint8_t>((x & ((1 << remainder) - 1)) << (8 - remainder)); | 228 | | // Already have remainder bits, next time need 8-remainder bits | 229 | 14.4M | need_bit = 8 - remainder; | 230 | 14.4M | } else { | 231 | 6.27M | need_bit = 0; | 232 | 6.27M | } | 233 | 20.6M | } | 234 | 169k | } |
Unexecuted instantiation: _ZN5doris10ForEncoderIhE10bit_pack_1EPKhhiPh Unexecuted instantiation: _ZN5doris10ForEncoderItE10bit_pack_1EPKthiPh Unexecuted instantiation: _ZN5doris10ForEncoderIjE10bit_pack_1EPKjhiPh Unexecuted instantiation: _ZN5doris10ForEncoderImE10bit_pack_1EPKmhiPh Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10bit_pack_1EPKS1_hiPh Unexecuted instantiation: _ZN5doris10ForEncoderIoE10bit_pack_1EPKohiPh |
235 | | |
236 | | // Use as few bit as possible to store a piece of integer data. |
237 | | // param[in] input: the integer list need to pack |
238 | | // param[in] in_num: the number integer need to pack |
239 | | // param[in] bit_width: how many bit we use to store each integer data |
240 | | // param[out] out: the packed result |
241 | | |
242 | | // For example: |
243 | | // The input is int32 list: 1, 2, 4, 8 and bit_width is 4 |
244 | | // The output will be: 0001 0010 0100 1000 |
245 | | template <typename T> |
246 | 243k | void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { |
247 | 243k | if (in_num == 0 || bit_width == 0) { |
248 | 260 | return; |
249 | 260 | } |
250 | | /* |
251 | | bit_width <= 8 : pack_8 > pack_16 > pack_32 |
252 | | bit_width <= 16 : pack_4 > pack_8 > pack_16 |
253 | | bit_width <= 32 : pack_4 >= pack_2 > pack_8 |
254 | | (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently) |
255 | | bit_width <= 64 : pack_1 > pack_4 |
256 | | */ |
257 | 242k | if (bit_width <= 8) { |
258 | 15.3k | bit_pack_8(input, in_num, bit_width, output); |
259 | 227k | } else if (bit_width <= 16) { |
260 | 15.2k | bit_pack_4<int64_t>(input, in_num, bit_width, output); |
261 | 212k | } else if (bit_width <= 32) { |
262 | 30.5k | bit_pack_4<__int128_t>(input, in_num, bit_width, output); |
263 | 181k | } else { |
264 | 181k | bit_pack_1(input, in_num, bit_width, output); |
265 | 181k | } |
266 | 242k | } Unexecuted instantiation: _ZN5doris10ForEncoderIaE8bit_packEPKahiPh Unexecuted instantiation: _ZN5doris10ForEncoderIsE8bit_packEPKshiPh _ZN5doris10ForEncoderIiE8bit_packEPKihiPh Line | Count | Source | 246 | 9 | void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 247 | 9 | if (in_num == 0 || bit_width == 0) { | 248 | 1 | return; | 249 | 1 | } | 250 | | /* | 251 | | bit_width <= 8 : pack_8 > pack_16 > pack_32 | 252 | | bit_width <= 16 : pack_4 > pack_8 > pack_16 | 253 | | bit_width <= 32 : pack_4 >= pack_2 > pack_8 | 254 | | (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently) | 255 | | bit_width <= 64 : pack_1 > pack_4 | 256 | | */ | 257 | 8 | if (bit_width <= 8) { | 258 | 8 | bit_pack_8(input, in_num, bit_width, output); | 259 | 8 | } else if (bit_width <= 16) { | 260 | 0 | bit_pack_4<int64_t>(input, in_num, bit_width, output); | 261 | 0 | } else if (bit_width <= 32) { | 262 | 0 | bit_pack_4<__int128_t>(input, in_num, bit_width, output); | 263 | 0 | } else { | 264 | 0 | bit_pack_1(input, in_num, bit_width, output); | 265 | 0 | } | 266 | 8 | } |
_ZN5doris10ForEncoderIlE8bit_packEPKlhiPh Line | Count | Source | 246 | 24.4k | void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 247 | 24.4k | if (in_num == 0 || bit_width == 0) { | 248 | 131 | return; | 249 | 131 | } | 250 | | /* | 251 | | bit_width <= 8 : pack_8 > pack_16 > pack_32 | 252 | | bit_width <= 16 : pack_4 > pack_8 > pack_16 | 253 | | bit_width <= 32 : pack_4 >= pack_2 > pack_8 | 254 | | (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently) | 255 | | bit_width <= 64 : pack_1 > pack_4 | 256 | | */ | 257 | 24.3k | if (bit_width <= 8) { | 258 | 3.05k | bit_pack_8(input, in_num, bit_width, output); | 259 | 21.2k | } else if (bit_width <= 16) { | 260 | 3.03k | bit_pack_4<int64_t>(input, in_num, bit_width, output); | 261 | 18.2k | } else if (bit_width <= 32) { | 262 | 6.08k | bit_pack_4<__int128_t>(input, in_num, bit_width, output); | 263 | 12.1k | } else { | 264 | 12.1k | bit_pack_1(input, in_num, bit_width, output); | 265 | 12.1k | } | 266 | 24.3k | } |
_ZN5doris10ForEncoderInE8bit_packEPKnhiPh Line | Count | Source | 246 | 218k | void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 247 | 218k | if (in_num == 0 || bit_width == 0) { | 248 | 128 | return; | 249 | 128 | } | 250 | | /* | 251 | | bit_width <= 8 : pack_8 > pack_16 > pack_32 | 252 | | bit_width <= 16 : pack_4 > pack_8 > pack_16 | 253 | | bit_width <= 32 : pack_4 >= pack_2 > pack_8 | 254 | | (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently) | 255 | | bit_width <= 64 : pack_1 > pack_4 | 256 | | */ | 257 | 218k | if (bit_width <= 8) { | 258 | 12.2k | bit_pack_8(input, in_num, bit_width, output); | 259 | 206k | } else if (bit_width <= 16) { | 260 | 12.2k | bit_pack_4<int64_t>(input, in_num, bit_width, output); | 261 | 194k | } else if (bit_width <= 32) { | 262 | 24.4k | bit_pack_4<__int128_t>(input, in_num, bit_width, output); | 263 | 169k | } else { | 264 | 169k | bit_pack_1(input, in_num, bit_width, output); | 265 | 169k | } | 266 | 218k | } |
Unexecuted instantiation: _ZN5doris10ForEncoderIhE8bit_packEPKhhiPh Unexecuted instantiation: _ZN5doris10ForEncoderItE8bit_packEPKthiPh _ZN5doris10ForEncoderIjE8bit_packEPKjhiPh Line | Count | Source | 246 | 6 | void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 247 | 6 | if (in_num == 0 || bit_width == 0) { | 248 | 0 | return; | 249 | 0 | } | 250 | | /* | 251 | | bit_width <= 8 : pack_8 > pack_16 > pack_32 | 252 | | bit_width <= 16 : pack_4 > pack_8 > pack_16 | 253 | | bit_width <= 32 : pack_4 >= pack_2 > pack_8 | 254 | | (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently) | 255 | | bit_width <= 64 : pack_1 > pack_4 | 256 | | */ | 257 | 6 | if (bit_width <= 8) { | 258 | 6 | bit_pack_8(input, in_num, bit_width, output); | 259 | 6 | } else if (bit_width <= 16) { | 260 | 0 | bit_pack_4<int64_t>(input, in_num, bit_width, output); | 261 | 0 | } else if (bit_width <= 32) { | 262 | 0 | bit_pack_4<__int128_t>(input, in_num, bit_width, output); | 263 | 0 | } else { | 264 | 0 | bit_pack_1(input, in_num, bit_width, output); | 265 | 0 | } | 266 | 6 | } |
Unexecuted instantiation: _ZN5doris10ForEncoderImE8bit_packEPKmhiPh Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE8bit_packEPKS1_hiPh Unexecuted instantiation: _ZN5doris10ForEncoderIoE8bit_packEPKohiPh |
267 | | |
268 | | template <typename T> |
269 | 48.9k | void ForEncoder<T>::bit_packing_one_frame_value(const T* input) { |
270 | 48.9k | T min = input[0]; |
271 | 48.9k | T max = input[0]; |
272 | 48.9k | bool is_ascending = true; |
273 | 48.9k | uint8_t bit_width = 0; |
274 | 48.9k | T half_max_delta = numeric_limits_max() >> 1; |
275 | 48.9k | bool is_keep_original_value = false; |
276 | | |
277 | | // 1. make sure order_flag, save_original_value, and find max&min. |
278 | 4.18M | for (uint8_t i = 1; i < _buffered_values_num; ++i) { |
279 | 4.13M | if (is_ascending) { |
280 | 86.4k | if (input[i] < input[i - 1]) { |
281 | 48.4k | is_ascending = false; |
282 | 48.4k | } else { |
283 | 38.0k | if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow |
284 | 0 | is_keep_original_value = true; |
285 | 38.0k | } else { |
286 | 38.0k | bit_width = std::max(bit_width, bits(input[i] - input[i - 1])); |
287 | 38.0k | } |
288 | 38.0k | } |
289 | 86.4k | } |
290 | | |
291 | 4.13M | if (input[i] < min) { |
292 | 180k | min = input[i]; |
293 | 180k | continue; |
294 | 180k | } |
295 | | |
296 | 3.95M | if (input[i] > max) { |
297 | 183k | max = input[i]; |
298 | 183k | } |
299 | 3.95M | } |
300 | 48.9k | if (!is_ascending) { |
301 | 48.4k | if ((max >> 1) - (min >> 1) > half_max_delta) { |
302 | 0 | is_keep_original_value = true; |
303 | 0 | } |
304 | 48.4k | } |
305 | | |
306 | | // 2. save min value. |
307 | 48.9k | if (sizeof(T) == 16) { |
308 | 24.4k | put_fixed128_le(_buffer, static_cast<uint128_t>(min)); |
309 | 24.4k | } else if (sizeof(T) == 8) { |
310 | 24.4k | put_fixed64_le(_buffer, static_cast<uint64_t>(min)); |
311 | 24.4k | } else { |
312 | 15 | put_fixed32_le(_buffer, static_cast<uint32_t>(min)); |
313 | 15 | } |
314 | | |
315 | | // 3.1 save original value. |
316 | 48.9k | if (is_keep_original_value) { |
317 | 0 | bit_width = sizeof(T) * 8; |
318 | 0 | uint32_t len = _buffered_values_num * bit_width; |
319 | 0 | _buffer->reserve(_buffer->size() + len); |
320 | 0 | size_t origin_size = _buffer->size(); |
321 | 0 | _buffer->resize(origin_size + len); |
322 | 0 | bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size); |
323 | 48.9k | } else { |
324 | | // 3.2 bit pack. |
325 | | // improve for ascending order input, we could use fewer bit |
326 | 48.9k | T delta_values[FRAME_VALUE_NUM]; |
327 | 48.9k | if (is_ascending) { |
328 | 449 | delta_values[0] = 0; |
329 | 3.17k | for (uint8_t i = 1; i < _buffered_values_num; ++i) { |
330 | 2.72k | delta_values[i] = input[i] - input[i - 1]; |
331 | 2.72k | } |
332 | 48.4k | } else { |
333 | 48.4k | bit_width = bits(static_cast<T>(max - min)); |
334 | 4.22M | for (uint8_t i = 0; i < _buffered_values_num; ++i) { |
335 | 4.17M | delta_values[i] = input[i] - min; |
336 | 4.17M | } |
337 | 48.4k | } |
338 | | |
339 | 48.9k | uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8); |
340 | | |
341 | 48.9k | _buffer->reserve(_buffer->size() + packing_len); |
342 | 48.9k | size_t origin_size = _buffer->size(); |
343 | 48.9k | _buffer->resize(origin_size + packing_len); |
344 | 48.9k | bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size); |
345 | 48.9k | } |
346 | 48.9k | uint8_t storage_format = 0; |
347 | 48.9k | if (is_keep_original_value) { |
348 | 0 | storage_format = 2; |
349 | 48.9k | } else if (is_ascending) { |
350 | 449 | storage_format = 1; |
351 | 449 | } |
352 | 48.9k | _storage_formats.push_back(storage_format); |
353 | 48.9k | _bit_widths.push_back(bit_width); |
354 | | |
355 | 48.9k | _buffered_values_num = 0; |
356 | 48.9k | } Unexecuted instantiation: _ZN5doris10ForEncoderIaE27bit_packing_one_frame_valueEPKa Unexecuted instantiation: _ZN5doris10ForEncoderIsE27bit_packing_one_frame_valueEPKs _ZN5doris10ForEncoderIiE27bit_packing_one_frame_valueEPKi Line | Count | Source | 269 | 9 | void ForEncoder<T>::bit_packing_one_frame_value(const T* input) { | 270 | 9 | T min = input[0]; | 271 | 9 | T max = input[0]; | 272 | 9 | bool is_ascending = true; | 273 | 9 | uint8_t bit_width = 0; | 274 | 9 | T half_max_delta = numeric_limits_max() >> 1; | 275 | 9 | bool is_keep_original_value = false; | 276 | | | 277 | | // 1. make sure order_flag, save_original_value, and find max&min. | 278 | 771 | for (uint8_t i = 1; i < _buffered_values_num; ++i) { | 279 | 762 | if (is_ascending) { | 280 | 762 | if (input[i] < input[i - 1]) { | 281 | 0 | is_ascending = false; | 282 | 762 | } else { | 283 | 762 | if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow | 284 | 0 | is_keep_original_value = true; | 285 | 762 | } else { | 286 | 762 | bit_width = std::max(bit_width, bits(input[i] - input[i - 1])); | 287 | 762 | } | 288 | 762 | } | 289 | 762 | } | 290 | | | 291 | 762 | if (input[i] < min) { | 292 | 0 | min = input[i]; | 293 | 0 | continue; | 294 | 0 | } | 295 | | | 296 | 762 | if (input[i] > max) { | 297 | 762 | max = input[i]; | 298 | 762 | } | 299 | 762 | } | 300 | 9 | if (!is_ascending) { | 301 | 0 | if ((max >> 1) - (min >> 1) > half_max_delta) { | 302 | 0 | is_keep_original_value = true; | 303 | 0 | } | 304 | 0 | } | 305 | | | 306 | | // 2. save min value. | 307 | 9 | if (sizeof(T) == 16) { | 308 | 0 | put_fixed128_le(_buffer, static_cast<uint128_t>(min)); | 309 | 9 | } else if (sizeof(T) == 8) { | 310 | 0 | put_fixed64_le(_buffer, static_cast<uint64_t>(min)); | 311 | 9 | } else { | 312 | 9 | put_fixed32_le(_buffer, static_cast<uint32_t>(min)); | 313 | 9 | } | 314 | | | 315 | | // 3.1 save original value. 
| 316 | 9 | if (is_keep_original_value) { | 317 | 0 | bit_width = sizeof(T) * 8; | 318 | 0 | uint32_t len = _buffered_values_num * bit_width; | 319 | 0 | _buffer->reserve(_buffer->size() + len); | 320 | 0 | size_t origin_size = _buffer->size(); | 321 | 0 | _buffer->resize(origin_size + len); | 322 | 0 | bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size); | 323 | 9 | } else { | 324 | | // 3.2 bit pack. | 325 | | // improve for ascending order input, we could use fewer bit | 326 | 9 | T delta_values[FRAME_VALUE_NUM]; | 327 | 9 | if (is_ascending) { | 328 | 9 | delta_values[0] = 0; | 329 | 771 | for (uint8_t i = 1; i < _buffered_values_num; ++i) { | 330 | 762 | delta_values[i] = input[i] - input[i - 1]; | 331 | 762 | } | 332 | 9 | } else { | 333 | 0 | bit_width = bits(static_cast<T>(max - min)); | 334 | 0 | for (uint8_t i = 0; i < _buffered_values_num; ++i) { | 335 | 0 | delta_values[i] = input[i] - min; | 336 | 0 | } | 337 | 0 | } | 338 | | | 339 | 9 | uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8); | 340 | | | 341 | 9 | _buffer->reserve(_buffer->size() + packing_len); | 342 | 9 | size_t origin_size = _buffer->size(); | 343 | 9 | _buffer->resize(origin_size + packing_len); | 344 | 9 | bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size); | 345 | 9 | } | 346 | 9 | uint8_t storage_format = 0; | 347 | 9 | if (is_keep_original_value) { | 348 | 0 | storage_format = 2; | 349 | 9 | } else if (is_ascending) { | 350 | 9 | storage_format = 1; | 351 | 9 | } | 352 | 9 | _storage_formats.push_back(storage_format); | 353 | 9 | _bit_widths.push_back(bit_width); | 354 | | | 355 | 9 | _buffered_values_num = 0; | 356 | 9 | } |
_ZN5doris10ForEncoderIlE27bit_packing_one_frame_valueEPKl Line | Count | Source | 269 | 24.4k | void ForEncoder<T>::bit_packing_one_frame_value(const T* input) { | 270 | 24.4k | T min = input[0]; | 271 | 24.4k | T max = input[0]; | 272 | 24.4k | bool is_ascending = true; | 273 | 24.4k | uint8_t bit_width = 0; | 274 | 24.4k | T half_max_delta = numeric_limits_max() >> 1; | 275 | 24.4k | bool is_keep_original_value = false; | 276 | | | 277 | | // 1. make sure order_flag, save_original_value, and find max&min. | 278 | 2.08M | for (uint8_t i = 1; i < _buffered_values_num; ++i) { | 279 | 2.06M | if (is_ascending) { | 280 | 43.3k | if (input[i] < input[i - 1]) { | 281 | 24.2k | is_ascending = false; | 282 | 24.2k | } else { | 283 | 19.1k | if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow | 284 | 0 | is_keep_original_value = true; | 285 | 19.1k | } else { | 286 | 19.1k | bit_width = std::max(bit_width, bits(input[i] - input[i - 1])); | 287 | 19.1k | } | 288 | 19.1k | } | 289 | 43.3k | } | 290 | | | 291 | 2.06M | if (input[i] < min) { | 292 | 88.0k | min = input[i]; | 293 | 88.0k | continue; | 294 | 88.0k | } | 295 | | | 296 | 1.97M | if (input[i] > max) { | 297 | 89.5k | max = input[i]; | 298 | 89.5k | } | 299 | 1.97M | } | 300 | 24.4k | if (!is_ascending) { | 301 | 24.2k | if ((max >> 1) - (min >> 1) > half_max_delta) { | 302 | 0 | is_keep_original_value = true; | 303 | 0 | } | 304 | 24.2k | } | 305 | | | 306 | | // 2. save min value. | 307 | 24.4k | if (sizeof(T) == 16) { | 308 | 0 | put_fixed128_le(_buffer, static_cast<uint128_t>(min)); | 309 | 24.4k | } else if (sizeof(T) == 8) { | 310 | 24.4k | put_fixed64_le(_buffer, static_cast<uint64_t>(min)); | 311 | 24.4k | } else { | 312 | 0 | put_fixed32_le(_buffer, static_cast<uint32_t>(min)); | 313 | 0 | } | 314 | | | 315 | | // 3.1 save original value. 
| 316 | 24.4k | if (is_keep_original_value) { | 317 | 0 | bit_width = sizeof(T) * 8; | 318 | 0 | uint32_t len = _buffered_values_num * bit_width; | 319 | 0 | _buffer->reserve(_buffer->size() + len); | 320 | 0 | size_t origin_size = _buffer->size(); | 321 | 0 | _buffer->resize(origin_size + len); | 322 | 0 | bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size); | 323 | 24.4k | } else { | 324 | | // 3.2 bit pack. | 325 | | // improve for ascending order input, we could use fewer bit | 326 | 24.4k | T delta_values[FRAME_VALUE_NUM]; | 327 | 24.4k | if (is_ascending) { | 328 | 220 | delta_values[0] = 0; | 329 | 1.29k | for (uint8_t i = 1; i < _buffered_values_num; ++i) { | 330 | 1.07k | delta_values[i] = input[i] - input[i - 1]; | 331 | 1.07k | } | 332 | 24.2k | } else { | 333 | 24.2k | bit_width = bits(static_cast<T>(max - min)); | 334 | 2.11M | for (uint8_t i = 0; i < _buffered_values_num; ++i) { | 335 | 2.08M | delta_values[i] = input[i] - min; | 336 | 2.08M | } | 337 | 24.2k | } | 338 | | | 339 | 24.4k | uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8); | 340 | | | 341 | 24.4k | _buffer->reserve(_buffer->size() + packing_len); | 342 | 24.4k | size_t origin_size = _buffer->size(); | 343 | 24.4k | _buffer->resize(origin_size + packing_len); | 344 | 24.4k | bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size); | 345 | 24.4k | } | 346 | 24.4k | uint8_t storage_format = 0; | 347 | 24.4k | if (is_keep_original_value) { | 348 | 0 | storage_format = 2; | 349 | 24.4k | } else if (is_ascending) { | 350 | 220 | storage_format = 1; | 351 | 220 | } | 352 | 24.4k | _storage_formats.push_back(storage_format); | 353 | 24.4k | _bit_widths.push_back(bit_width); | 354 | | | 355 | 24.4k | _buffered_values_num = 0; | 356 | 24.4k | } |
_ZN5doris10ForEncoderInE27bit_packing_one_frame_valueEPKn Line | Count | Source | 269 | 24.4k | void ForEncoder<T>::bit_packing_one_frame_value(const T* input) { | 270 | 24.4k | T min = input[0]; | 271 | 24.4k | T max = input[0]; | 272 | 24.4k | bool is_ascending = true; | 273 | 24.4k | uint8_t bit_width = 0; | 274 | 24.4k | T half_max_delta = numeric_limits_max() >> 1; | 275 | 24.4k | bool is_keep_original_value = false; | 276 | | | 277 | | // 1. make sure order_flag, save_original_value, and find max&min. | 278 | 2.08M | for (uint8_t i = 1; i < _buffered_values_num; ++i) { | 279 | 2.06M | if (is_ascending) { | 280 | 41.5k | if (input[i] < input[i - 1]) { | 281 | 24.2k | is_ascending = false; | 282 | 24.2k | } else { | 283 | 17.3k | if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow | 284 | 0 | is_keep_original_value = true; | 285 | 17.3k | } else { | 286 | 17.3k | bit_width = std::max(bit_width, bits(input[i] - input[i - 1])); | 287 | 17.3k | } | 288 | 17.3k | } | 289 | 41.5k | } | 290 | | | 291 | 2.06M | if (input[i] < min) { | 292 | 92.7k | min = input[i]; | 293 | 92.7k | continue; | 294 | 92.7k | } | 295 | | | 296 | 1.97M | if (input[i] > max) { | 297 | 92.5k | max = input[i]; | 298 | 92.5k | } | 299 | 1.97M | } | 300 | 24.4k | if (!is_ascending) { | 301 | 24.2k | if ((max >> 1) - (min >> 1) > half_max_delta) { | 302 | 0 | is_keep_original_value = true; | 303 | 0 | } | 304 | 24.2k | } | 305 | | | 306 | | // 2. save min value. | 307 | 24.4k | if (sizeof(T) == 16) { | 308 | 24.4k | put_fixed128_le(_buffer, static_cast<uint128_t>(min)); | 309 | 24.4k | } else if (sizeof(T) == 8) { | 310 | 0 | put_fixed64_le(_buffer, static_cast<uint64_t>(min)); | 311 | 0 | } else { | 312 | 0 | put_fixed32_le(_buffer, static_cast<uint32_t>(min)); | 313 | 0 | } | 314 | | | 315 | | // 3.1 save original value. 
| 316 | 24.4k | if (is_keep_original_value) { | 317 | 0 | bit_width = sizeof(T) * 8; | 318 | 0 | uint32_t len = _buffered_values_num * bit_width; | 319 | 0 | _buffer->reserve(_buffer->size() + len); | 320 | 0 | size_t origin_size = _buffer->size(); | 321 | 0 | _buffer->resize(origin_size + len); | 322 | 0 | bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size); | 323 | 24.4k | } else { | 324 | | // 3.2 bit pack. | 325 | | // improve for ascending order input, we could use fewer bit | 326 | 24.4k | T delta_values[FRAME_VALUE_NUM]; | 327 | 24.4k | if (is_ascending) { | 328 | 214 | delta_values[0] = 0; | 329 | 338 | for (uint8_t i = 1; i < _buffered_values_num; ++i) { | 330 | 124 | delta_values[i] = input[i] - input[i - 1]; | 331 | 124 | } | 332 | 24.2k | } else { | 333 | 24.2k | bit_width = bits(static_cast<T>(max - min)); | 334 | 2.11M | for (uint8_t i = 0; i < _buffered_values_num; ++i) { | 335 | 2.08M | delta_values[i] = input[i] - min; | 336 | 2.08M | } | 337 | 24.2k | } | 338 | | | 339 | 24.4k | uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8); | 340 | | | 341 | 24.4k | _buffer->reserve(_buffer->size() + packing_len); | 342 | 24.4k | size_t origin_size = _buffer->size(); | 343 | 24.4k | _buffer->resize(origin_size + packing_len); | 344 | 24.4k | bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size); | 345 | 24.4k | } | 346 | 24.4k | uint8_t storage_format = 0; | 347 | 24.4k | if (is_keep_original_value) { | 348 | 0 | storage_format = 2; | 349 | 24.4k | } else if (is_ascending) { | 350 | 214 | storage_format = 1; | 351 | 214 | } | 352 | 24.4k | _storage_formats.push_back(storage_format); | 353 | 24.4k | _bit_widths.push_back(bit_width); | 354 | | | 355 | 24.4k | _buffered_values_num = 0; | 356 | 24.4k | } |
Unexecuted instantiation: _ZN5doris10ForEncoderIhE27bit_packing_one_frame_valueEPKh Unexecuted instantiation: _ZN5doris10ForEncoderItE27bit_packing_one_frame_valueEPKt _ZN5doris10ForEncoderIjE27bit_packing_one_frame_valueEPKj Line | Count | Source | 269 | 6 | void ForEncoder<T>::bit_packing_one_frame_value(const T* input) { | 270 | 6 | T min = input[0]; | 271 | 6 | T max = input[0]; | 272 | 6 | bool is_ascending = true; | 273 | 6 | uint8_t bit_width = 0; | 274 | 6 | T half_max_delta = numeric_limits_max() >> 1; | 275 | 6 | bool is_keep_original_value = false; | 276 | | | 277 | | // 1. make sure order_flag, save_original_value, and find max&min. | 278 | 768 | for (uint8_t i = 1; i < _buffered_values_num; ++i) { | 279 | 762 | if (is_ascending) { | 280 | 762 | if (input[i] < input[i - 1]) { | 281 | 0 | is_ascending = false; | 282 | 762 | } else { | 283 | 762 | if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow | 284 | 0 | is_keep_original_value = true; | 285 | 762 | } else { | 286 | 762 | bit_width = std::max(bit_width, bits(input[i] - input[i - 1])); | 287 | 762 | } | 288 | 762 | } | 289 | 762 | } | 290 | | | 291 | 762 | if (input[i] < min) { | 292 | 0 | min = input[i]; | 293 | 0 | continue; | 294 | 0 | } | 295 | | | 296 | 762 | if (input[i] > max) { | 297 | 762 | max = input[i]; | 298 | 762 | } | 299 | 762 | } | 300 | 6 | if (!is_ascending) { | 301 | 0 | if ((max >> 1) - (min >> 1) > half_max_delta) { | 302 | 0 | is_keep_original_value = true; | 303 | 0 | } | 304 | 0 | } | 305 | | | 306 | | // 2. save min value. | 307 | 6 | if (sizeof(T) == 16) { | 308 | 0 | put_fixed128_le(_buffer, static_cast<uint128_t>(min)); | 309 | 6 | } else if (sizeof(T) == 8) { | 310 | 0 | put_fixed64_le(_buffer, static_cast<uint64_t>(min)); | 311 | 6 | } else { | 312 | 6 | put_fixed32_le(_buffer, static_cast<uint32_t>(min)); | 313 | 6 | } | 314 | | | 315 | | // 3.1 save original value. 
| 316 | 6 | if (is_keep_original_value) { | 317 | 0 | bit_width = sizeof(T) * 8; | 318 | 0 | uint32_t len = _buffered_values_num * bit_width; | 319 | 0 | _buffer->reserve(_buffer->size() + len); | 320 | 0 | size_t origin_size = _buffer->size(); | 321 | 0 | _buffer->resize(origin_size + len); | 322 | 0 | bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size); | 323 | 6 | } else { | 324 | | // 3.2 bit pack. | 325 | | // improve for ascending order input, we could use fewer bit | 326 | 6 | T delta_values[FRAME_VALUE_NUM]; | 327 | 6 | if (is_ascending) { | 328 | 6 | delta_values[0] = 0; | 329 | 768 | for (uint8_t i = 1; i < _buffered_values_num; ++i) { | 330 | 762 | delta_values[i] = input[i] - input[i - 1]; | 331 | 762 | } | 332 | 6 | } else { | 333 | 0 | bit_width = bits(static_cast<T>(max - min)); | 334 | 0 | for (uint8_t i = 0; i < _buffered_values_num; ++i) { | 335 | 0 | delta_values[i] = input[i] - min; | 336 | 0 | } | 337 | 0 | } | 338 | | | 339 | 6 | uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8); | 340 | | | 341 | 6 | _buffer->reserve(_buffer->size() + packing_len); | 342 | 6 | size_t origin_size = _buffer->size(); | 343 | 6 | _buffer->resize(origin_size + packing_len); | 344 | 6 | bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size); | 345 | 6 | } | 346 | 6 | uint8_t storage_format = 0; | 347 | 6 | if (is_keep_original_value) { | 348 | 0 | storage_format = 2; | 349 | 6 | } else if (is_ascending) { | 350 | 6 | storage_format = 1; | 351 | 6 | } | 352 | 6 | _storage_formats.push_back(storage_format); | 353 | 6 | _bit_widths.push_back(bit_width); | 354 | | | 355 | 6 | _buffered_values_num = 0; | 356 | 6 | } |
Unexecuted instantiation: _ZN5doris10ForEncoderImE27bit_packing_one_frame_valueEPKm Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE27bit_packing_one_frame_valueEPKS1_ Unexecuted instantiation: _ZN5doris10ForEncoderIoE27bit_packing_one_frame_valueEPKo |
357 | | |
358 | | template <typename T> |
359 | 32.6k | uint32_t ForEncoder<T>::flush() { |
360 | 32.6k | if (_buffered_values_num != 0) { |
361 | 32.5k | bit_packing_one_frame_value(_buffered_values); |
362 | 32.5k | } |
363 | | |
364 | | // write the footer: |
365 | | // 1 _storage_formats and bit_widths |
366 | 32.6k | DCHECK(_storage_formats.size() == _bit_widths.size()) |
367 | 0 | << "Size of _storage_formats and _bit_widths should be equal."; |
368 | 81.5k | for (size_t i = 0; i < _storage_formats.size(); i++) { |
369 | 48.9k | _buffer->append(&_storage_formats[i], 1); |
370 | 48.9k | _buffer->append(&_bit_widths[i], 1); |
371 | 48.9k | } |
372 | | // 2 frame_value_num and values_num |
373 | 32.6k | uint8_t frame_value_num = FRAME_VALUE_NUM; |
374 | 32.6k | _buffer->append(&frame_value_num, 1); |
375 | 32.6k | put_fixed32_le(_buffer, _values_num); |
376 | | |
377 | 32.6k | return cast_set<uint32_t>(_buffer->size()); |
378 | 32.6k | } Unexecuted instantiation: _ZN5doris10ForEncoderIaE5flushEv Unexecuted instantiation: _ZN5doris10ForEncoderIsE5flushEv _ZN5doris10ForEncoderIiE5flushEv Line | Count | Source | 359 | 7 | uint32_t ForEncoder<T>::flush() { | 360 | 7 | if (_buffered_values_num != 0) { | 361 | 4 | bit_packing_one_frame_value(_buffered_values); | 362 | 4 | } | 363 | | | 364 | | // write the footer: | 365 | | // 1 _storage_formats and bit_widths | 366 | 7 | DCHECK(_storage_formats.size() == _bit_widths.size()) | 367 | 0 | << "Size of _storage_formats and _bit_widths should be equal."; | 368 | 16 | for (size_t i = 0; i < _storage_formats.size(); i++) { | 369 | 9 | _buffer->append(&_storage_formats[i], 1); | 370 | 9 | _buffer->append(&_bit_widths[i], 1); | 371 | 9 | } | 372 | | // 2 frame_value_num and values_num | 373 | 7 | uint8_t frame_value_num = FRAME_VALUE_NUM; | 374 | 7 | _buffer->append(&frame_value_num, 1); | 375 | 7 | put_fixed32_le(_buffer, _values_num); | 376 | | | 377 | 7 | return cast_set<uint32_t>(_buffer->size()); | 378 | 7 | } |
_ZN5doris10ForEncoderIlE5flushEv Line | Count | Source | 359 | 16.3k | uint32_t ForEncoder<T>::flush() { | 360 | 16.3k | if (_buffered_values_num != 0) { | 361 | 16.2k | bit_packing_one_frame_value(_buffered_values); | 362 | 16.2k | } | 363 | | | 364 | | // write the footer: | 365 | | // 1 _storage_formats and bit_widths | 366 | 16.3k | DCHECK(_storage_formats.size() == _bit_widths.size()) | 367 | 0 | << "Size of _storage_formats and _bit_widths should be equal."; | 368 | 40.7k | for (size_t i = 0; i < _storage_formats.size(); i++) { | 369 | 24.4k | _buffer->append(&_storage_formats[i], 1); | 370 | 24.4k | _buffer->append(&_bit_widths[i], 1); | 371 | 24.4k | } | 372 | | // 2 frame_value_num and values_num | 373 | 16.3k | uint8_t frame_value_num = FRAME_VALUE_NUM; | 374 | 16.3k | _buffer->append(&frame_value_num, 1); | 375 | 16.3k | put_fixed32_le(_buffer, _values_num); | 376 | | | 377 | 16.3k | return cast_set<uint32_t>(_buffer->size()); | 378 | 16.3k | } |
_ZN5doris10ForEncoderInE5flushEv Line | Count | Source | 359 | 16.3k | uint32_t ForEncoder<T>::flush() { | 360 | 16.3k | if (_buffered_values_num != 0) { | 361 | 16.2k | bit_packing_one_frame_value(_buffered_values); | 362 | 16.2k | } | 363 | | | 364 | | // write the footer: | 365 | | // 1 _storage_formats and bit_widths | 366 | 16.3k | DCHECK(_storage_formats.size() == _bit_widths.size()) | 367 | 0 | << "Size of _storage_formats and _bit_widths should be equal."; | 368 | 40.7k | for (size_t i = 0; i < _storage_formats.size(); i++) { | 369 | 24.4k | _buffer->append(&_storage_formats[i], 1); | 370 | 24.4k | _buffer->append(&_bit_widths[i], 1); | 371 | 24.4k | } | 372 | | // 2 frame_value_num and values_num | 373 | 16.3k | uint8_t frame_value_num = FRAME_VALUE_NUM; | 374 | 16.3k | _buffer->append(&frame_value_num, 1); | 375 | 16.3k | put_fixed32_le(_buffer, _values_num); | 376 | | | 377 | 16.3k | return cast_set<uint32_t>(_buffer->size()); | 378 | 16.3k | } |
Unexecuted instantiation: _ZN5doris10ForEncoderIhE5flushEv Unexecuted instantiation: _ZN5doris10ForEncoderItE5flushEv _ZN5doris10ForEncoderIjE5flushEv Line | Count | Source | 359 | 3 | uint32_t ForEncoder<T>::flush() { | 360 | 3 | if (_buffered_values_num != 0) { | 361 | 0 | bit_packing_one_frame_value(_buffered_values); | 362 | 0 | } | 363 | | | 364 | | // write the footer: | 365 | | // 1 _storage_formats and bit_widths | 366 | 3 | DCHECK(_storage_formats.size() == _bit_widths.size()) | 367 | 0 | << "Size of _storage_formats and _bit_widths should be equal."; | 368 | 9 | for (size_t i = 0; i < _storage_formats.size(); i++) { | 369 | 6 | _buffer->append(&_storage_formats[i], 1); | 370 | 6 | _buffer->append(&_bit_widths[i], 1); | 371 | 6 | } | 372 | | // 2 frame_value_num and values_num | 373 | 3 | uint8_t frame_value_num = FRAME_VALUE_NUM; | 374 | 3 | _buffer->append(&frame_value_num, 1); | 375 | 3 | put_fixed32_le(_buffer, _values_num); | 376 | | | 377 | 3 | return cast_set<uint32_t>(_buffer->size()); | 378 | 3 | } |
Unexecuted instantiation: _ZN5doris10ForEncoderImE5flushEv Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE5flushEv Unexecuted instantiation: _ZN5doris10ForEncoderIoE5flushEv |
379 | | |
380 | | template <typename T> |
381 | 48.9k | const T ForEncoder<T>::numeric_limits_max() { |
382 | 48.9k | return std::numeric_limits<T>::max(); |
383 | 48.9k | } Unexecuted instantiation: _ZN5doris10ForEncoderIaE18numeric_limits_maxEv Unexecuted instantiation: _ZN5doris10ForEncoderIsE18numeric_limits_maxEv _ZN5doris10ForEncoderIiE18numeric_limits_maxEv Line | Count | Source | 381 | 9 | const T ForEncoder<T>::numeric_limits_max() { | 382 | 9 | return std::numeric_limits<T>::max(); | 383 | 9 | } |
_ZN5doris10ForEncoderIlE18numeric_limits_maxEv Line | Count | Source | 381 | 24.4k | const T ForEncoder<T>::numeric_limits_max() { | 382 | 24.4k | return std::numeric_limits<T>::max(); | 383 | 24.4k | } |
_ZN5doris10ForEncoderInE18numeric_limits_maxEv Line | Count | Source | 381 | 24.4k | const T ForEncoder<T>::numeric_limits_max() { | 382 | 24.4k | return std::numeric_limits<T>::max(); | 383 | 24.4k | } |
Unexecuted instantiation: _ZN5doris10ForEncoderIhE18numeric_limits_maxEv Unexecuted instantiation: _ZN5doris10ForEncoderItE18numeric_limits_maxEv _ZN5doris10ForEncoderIjE18numeric_limits_maxEv Line | Count | Source | 381 | 6 | const T ForEncoder<T>::numeric_limits_max() { | 382 | 6 | return std::numeric_limits<T>::max(); | 383 | 6 | } |
Unexecuted instantiation: _ZN5doris10ForEncoderImE18numeric_limits_maxEv Unexecuted instantiation: _ZN5doris10ForEncoderIoE18numeric_limits_maxEv |
384 | | |
385 | | template <> |
386 | 0 | const uint24_t ForEncoder<uint24_t>::numeric_limits_max() { |
387 | 0 | return 0XFFFFFF; |
388 | 0 | } |
389 | | |
390 | | template <typename T> |
391 | 32.6k | bool ForDecoder<T>::init() { |
392 | | // When row count is zero, the minimum footer size is 5: |
393 | | // only has ValuesNum(4) + FrameValueNum(1) |
394 | 32.6k | if (_buffer_len < 5) { |
395 | 0 | return false; |
396 | 0 | } |
397 | | |
398 | 32.6k | _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5); |
399 | 32.6k | _values_num = decode_fixed32_le(_buffer + _buffer_len - 4); |
400 | 32.6k | _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0); |
401 | 32.6k | _last_frame_size = |
402 | 32.6k | cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num)); |
403 | | |
404 | 32.6k | size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2; |
405 | | |
406 | | // read _storage_formats, bit_widths and compute frame_offsets |
407 | 32.6k | u_int32_t frame_start_offset = 0; |
408 | 81.5k | for (uint32_t i = 0; i < _frame_count; i++) { |
409 | 48.9k | uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset); |
410 | 48.9k | uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1); |
411 | 48.9k | _bit_widths.push_back(bit_width); |
412 | 48.9k | _storage_formats.push_back(order_flag); |
413 | | |
414 | 48.9k | bit_width_offset += 2; |
415 | | |
416 | 48.9k | _frame_offsets.push_back(frame_start_offset); |
417 | 48.9k | if (sizeof(T) == 16) { |
418 | 24.4k | frame_start_offset += bit_width * _max_frame_size / 8 + 16; |
419 | 24.4k | } else if (sizeof(T) == 8) { |
420 | 24.4k | frame_start_offset += bit_width * _max_frame_size / 8 + 8; |
421 | 24.4k | } else { |
422 | 15 | frame_start_offset += bit_width * _max_frame_size / 8 + 4; |
423 | 15 | } |
424 | 48.9k | } |
425 | | |
426 | 32.6k | _out_buffer.resize(_max_frame_size); |
427 | 32.6k | _parsed = true; |
428 | | |
429 | 32.6k | return true; |
430 | 32.6k | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE4initEv Unexecuted instantiation: _ZN5doris10ForDecoderIsE4initEv _ZN5doris10ForDecoderIiE4initEv Line | Count | Source | 391 | 7 | bool ForDecoder<T>::init() { | 392 | | // When row count is zero, the minimum footer size is 5: | 393 | | // only has ValuesNum(4) + FrameValueNum(1) | 394 | 7 | if (_buffer_len < 5) { | 395 | 0 | return false; | 396 | 0 | } | 397 | | | 398 | 7 | _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5); | 399 | 7 | _values_num = decode_fixed32_le(_buffer + _buffer_len - 4); | 400 | 7 | _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0); | 401 | 7 | _last_frame_size = | 402 | 7 | cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num)); | 403 | | | 404 | 7 | size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2; | 405 | | | 406 | | // read _storage_formats, bit_widths and compute frame_offsets | 407 | 7 | u_int32_t frame_start_offset = 0; | 408 | 16 | for (uint32_t i = 0; i < _frame_count; i++) { | 409 | 9 | uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset); | 410 | 9 | uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1); | 411 | 9 | _bit_widths.push_back(bit_width); | 412 | 9 | _storage_formats.push_back(order_flag); | 413 | | | 414 | 9 | bit_width_offset += 2; | 415 | | | 416 | 9 | _frame_offsets.push_back(frame_start_offset); | 417 | 9 | if (sizeof(T) == 16) { | 418 | 0 | frame_start_offset += bit_width * _max_frame_size / 8 + 16; | 419 | 9 | } else if (sizeof(T) == 8) { | 420 | 0 | frame_start_offset += bit_width * _max_frame_size / 8 + 8; | 421 | 9 | } else { | 422 | 9 | frame_start_offset += bit_width * _max_frame_size / 8 + 4; | 423 | 9 | } | 424 | 9 | } | 425 | | | 426 | 7 | _out_buffer.resize(_max_frame_size); | 427 | 7 | _parsed = true; | 428 | | | 429 | 7 | return true; | 430 | 7 | } |
_ZN5doris10ForDecoderIlE4initEv Line | Count | Source | 391 | 16.3k | bool ForDecoder<T>::init() { | 392 | | // When row count is zero, the minimum footer size is 5: | 393 | | // only has ValuesNum(4) + FrameValueNum(1) | 394 | 16.3k | if (_buffer_len < 5) { | 395 | 0 | return false; | 396 | 0 | } | 397 | | | 398 | 16.3k | _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5); | 399 | 16.3k | _values_num = decode_fixed32_le(_buffer + _buffer_len - 4); | 400 | 16.3k | _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0); | 401 | 16.3k | _last_frame_size = | 402 | 16.3k | cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num)); | 403 | | | 404 | 16.3k | size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2; | 405 | | | 406 | | // read _storage_formats, bit_widths and compute frame_offsets | 407 | 16.3k | u_int32_t frame_start_offset = 0; | 408 | 40.7k | for (uint32_t i = 0; i < _frame_count; i++) { | 409 | 24.4k | uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset); | 410 | 24.4k | uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1); | 411 | 24.4k | _bit_widths.push_back(bit_width); | 412 | 24.4k | _storage_formats.push_back(order_flag); | 413 | | | 414 | 24.4k | bit_width_offset += 2; | 415 | | | 416 | 24.4k | _frame_offsets.push_back(frame_start_offset); | 417 | 24.4k | if (sizeof(T) == 16) { | 418 | 0 | frame_start_offset += bit_width * _max_frame_size / 8 + 16; | 419 | 24.4k | } else if (sizeof(T) == 8) { | 420 | 24.4k | frame_start_offset += bit_width * _max_frame_size / 8 + 8; | 421 | 24.4k | } else { | 422 | 0 | frame_start_offset += bit_width * _max_frame_size / 8 + 4; | 423 | 0 | } | 424 | 24.4k | } | 425 | | | 426 | 16.3k | _out_buffer.resize(_max_frame_size); | 427 | 16.3k | _parsed = true; | 428 | | | 429 | 16.3k | return true; | 430 | 16.3k | } |
_ZN5doris10ForDecoderInE4initEv Line | Count | Source | 391 | 16.3k | bool ForDecoder<T>::init() { | 392 | | // When row count is zero, the minimum footer size is 5: | 393 | | // only has ValuesNum(4) + FrameValueNum(1) | 394 | 16.3k | if (_buffer_len < 5) { | 395 | 0 | return false; | 396 | 0 | } | 397 | | | 398 | 16.3k | _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5); | 399 | 16.3k | _values_num = decode_fixed32_le(_buffer + _buffer_len - 4); | 400 | 16.3k | _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0); | 401 | 16.3k | _last_frame_size = | 402 | 16.3k | cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num)); | 403 | | | 404 | 16.3k | size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2; | 405 | | | 406 | | // read _storage_formats, bit_widths and compute frame_offsets | 407 | 16.3k | u_int32_t frame_start_offset = 0; | 408 | 40.7k | for (uint32_t i = 0; i < _frame_count; i++) { | 409 | 24.4k | uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset); | 410 | 24.4k | uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1); | 411 | 24.4k | _bit_widths.push_back(bit_width); | 412 | 24.4k | _storage_formats.push_back(order_flag); | 413 | | | 414 | 24.4k | bit_width_offset += 2; | 415 | | | 416 | 24.4k | _frame_offsets.push_back(frame_start_offset); | 417 | 24.4k | if (sizeof(T) == 16) { | 418 | 24.4k | frame_start_offset += bit_width * _max_frame_size / 8 + 16; | 419 | 24.4k | } else if (sizeof(T) == 8) { | 420 | 0 | frame_start_offset += bit_width * _max_frame_size / 8 + 8; | 421 | 0 | } else { | 422 | 0 | frame_start_offset += bit_width * _max_frame_size / 8 + 4; | 423 | 0 | } | 424 | 24.4k | } | 425 | | | 426 | 16.3k | _out_buffer.resize(_max_frame_size); | 427 | 16.3k | _parsed = true; | 428 | | | 429 | 16.3k | return true; | 430 | 16.3k | } |
Unexecuted instantiation: _ZN5doris10ForDecoderIhE4initEv Unexecuted instantiation: _ZN5doris10ForDecoderItE4initEv _ZN5doris10ForDecoderIjE4initEv Line | Count | Source | 391 | 3 | bool ForDecoder<T>::init() { | 392 | | // When row count is zero, the minimum footer size is 5: | 393 | | // only has ValuesNum(4) + FrameValueNum(1) | 394 | 3 | if (_buffer_len < 5) { | 395 | 0 | return false; | 396 | 0 | } | 397 | | | 398 | 3 | _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5); | 399 | 3 | _values_num = decode_fixed32_le(_buffer + _buffer_len - 4); | 400 | 3 | _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0); | 401 | 3 | _last_frame_size = | 402 | 3 | cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num)); | 403 | | | 404 | 3 | size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2; | 405 | | | 406 | | // read _storage_formats, bit_widths and compute frame_offsets | 407 | 3 | u_int32_t frame_start_offset = 0; | 408 | 9 | for (uint32_t i = 0; i < _frame_count; i++) { | 409 | 6 | uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset); | 410 | 6 | uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1); | 411 | 6 | _bit_widths.push_back(bit_width); | 412 | 6 | _storage_formats.push_back(order_flag); | 413 | | | 414 | 6 | bit_width_offset += 2; | 415 | | | 416 | 6 | _frame_offsets.push_back(frame_start_offset); | 417 | 6 | if (sizeof(T) == 16) { | 418 | 0 | frame_start_offset += bit_width * _max_frame_size / 8 + 16; | 419 | 6 | } else if (sizeof(T) == 8) { | 420 | 0 | frame_start_offset += bit_width * _max_frame_size / 8 + 8; | 421 | 6 | } else { | 422 | 6 | frame_start_offset += bit_width * _max_frame_size / 8 + 4; | 423 | 6 | } | 424 | 6 | } | 425 | | | 426 | 3 | _out_buffer.resize(_max_frame_size); | 427 | 3 | _parsed = true; | 428 | | | 429 | 3 | return true; | 430 | 3 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderImE4initEv Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE4initEv Unexecuted instantiation: _ZN5doris10ForDecoderIoE4initEv |
431 | | |
432 | | // todo(kks): improve this method by SIMD instructions |
433 | | |
434 | | template <typename T> |
435 | | template <typename U> |
436 | | void ForDecoder<T>::bit_unpack_optimize(const uint8_t* input, uint8_t in_num, int bit_width, |
437 | 81.3k | T* output) { |
438 | 81.3k | static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value, |
439 | 81.3k | "bit_unpack_optimize only supports U = int64_t or __int128_t"); |
440 | 81.3k | constexpr int u_size = sizeof(U); // Size of U |
441 | 81.3k | constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size) |
442 | 81.3k | int valid_bit = 0; // How many valid bits |
443 | 81.3k | int need_bit = 0; // still need |
444 | 81.3k | size_t input_size = (in_num * bit_width + 7) >> 3; // input's size |
445 | 81.3k | int full_batch_size = |
446 | 81.3k | cast_set<int>((input_size >> u_size_shift) |
447 | 81.3k | << u_size_shift); // Adjust input_size to a multiple of u_size |
448 | 81.3k | int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size. |
449 | | // The number of bits in input to adjust to multiples of 8 and thus more |
450 | 81.3k | int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width)); |
451 | | |
452 | | // to ensure that only bit_width bits are valid |
453 | 81.3k | T output_mask; |
454 | 81.3k | if (bit_width >= static_cast<int>(sizeof(T) * 8)) { |
455 | 0 | output_mask = static_cast<T>(~T(0)); |
456 | 81.3k | } else { |
457 | 81.3k | output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1); |
458 | 81.3k | } |
459 | | |
460 | 81.3k | U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking |
461 | | |
462 | 4.48M | for (int i = 0; i < full_batch_size; i += u_size) { |
463 | 4.40M | s = 0; |
464 | | |
465 | 4.40M | s = to_endian<std::endian::big>(*((U*)(input + i))); |
466 | | |
467 | | // Determine what the valid bits are based on u_size |
468 | 4.40M | valid_bit = u_size << 3; |
469 | | |
470 | | // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop. |
471 | 4.40M | if (tail_count == 0 && i == full_batch_size - u_size) { |
472 | 21.7k | valid_bit -= more_bit; |
473 | 21.7k | s >>= more_bit; |
474 | 21.7k | } |
475 | | |
476 | 4.40M | if (need_bit) { |
477 | | // The last time we take away the high bit_width - need_bit, |
478 | | // we need to make up the rest of the need_bit from the width. |
479 | | // Use valid_bit - need_bit to compute high need_bit bits of s |
480 | | // perform an AND operation to ensure that only need_bit bits are valid |
481 | 4.09M | auto mask = (static_cast<U>(1) << need_bit) - 1; |
482 | 4.09M | auto shifted = s >> (valid_bit - need_bit); |
483 | 4.09M | auto masked_result = shifted & mask; |
484 | 4.09M | if constexpr (sizeof(T) <= 4) { |
485 | 0 | *output |= static_cast<T>(static_cast<uint32_t>(masked_result)); |
486 | 4.09M | } else { |
487 | 4.09M | *output |= static_cast<T>(masked_result); |
488 | 4.09M | } |
489 | 4.09M | output++; |
490 | 4.09M | valid_bit -= need_bit; |
491 | 4.09M | } |
492 | | |
493 | 4.40M | int num = valid_bit / bit_width; // How many outputs can be processed at a time |
494 | 4.40M | int remainder = valid_bit - num * bit_width; // How many bits are left to store |
495 | | |
496 | | // Starting with the highest valid bit, take out bit_width bits in sequence |
497 | | // perform an AND operation with output_mask to ensure that only bit_width bits are valid |
498 | | // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end |
499 | | // But since there are still remainder bits that can't be processed, need to add the remainder |
500 | 8.51M | for (int j = 0; j < num; j++) { |
501 | 4.11M | *output = |
502 | 4.11M | static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask); |
503 | 4.11M | output++; |
504 | 4.11M | } |
505 | | |
506 | 4.40M | if (remainder) { |
507 | | // Process the last remaining remainder bit. |
508 | | // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits. |
509 | | // output = y << (bit_width - remainder) Use the high bit_width - remainder bit |
510 | 4.14M | if constexpr (sizeof(T) <= 4) { |
511 | 0 | auto masked_value = static_cast<T>( |
512 | 0 | static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1))); |
513 | 0 | *output = static_cast<T>(masked_value << (bit_width - remainder)); |
514 | 4.14M | } else { |
515 | 4.14M | auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1))); |
516 | 4.14M | *output = static_cast<T>(masked_value << (bit_width - remainder)); |
517 | 4.14M | } |
518 | | // Already have remainder bits, next time need bit_width - remainder bits |
519 | 4.14M | need_bit = bit_width - remainder; |
520 | 4.14M | } else { |
521 | 257k | need_bit = 0; |
522 | 257k | } |
523 | 4.40M | } |
524 | | |
525 | | // remainder |
526 | 81.3k | if (tail_count) { |
527 | | // Put the tail_count numbers in the input into s in order, each number occupies 8 bit |
528 | 477k | for (int i = 0; i < tail_count; i++) { |
529 | 417k | s <<= 8; |
530 | 417k | s |= input[full_batch_size + i]; |
531 | 417k | } |
532 | | |
533 | | // tail * 8 is the number of bits that are left to process |
534 | | // tail * 8 - more_bit is to remove the last more_bit |
535 | 59.2k | valid_bit = (tail_count << 3) - more_bit; |
536 | 59.2k | s >>= more_bit; |
537 | | |
538 | | // same as before |
539 | 59.2k | if (need_bit) { |
540 | 54.0k | if constexpr (sizeof(T) <= 4) { |
541 | 0 | *output |= static_cast<T>(static_cast<uint32_t>( |
542 | 0 | (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1))); |
543 | 54.0k | } else { |
544 | 54.0k | *output |= static_cast<T>((s >> (valid_bit - need_bit)) & |
545 | 54.0k | ((static_cast<U>(1) << need_bit) - 1)); |
546 | 54.0k | } |
547 | 54.0k | output++; |
548 | 54.0k | valid_bit -= need_bit; |
549 | 54.0k | } |
550 | | |
551 | 59.2k | int num = valid_bit / bit_width; // How many outputs can be processed at a time |
552 | | |
553 | | // same as before |
554 | 126k | for (int j = 0; j < num; j++) { |
555 | 67.2k | *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask); |
556 | 67.2k | output++; |
557 | 67.2k | } |
558 | 59.2k | } |
559 | 81.3k | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE19bit_unpack_optimizeIlEEvPKhhiPa Unexecuted instantiation: _ZN5doris10ForDecoderIaE19bit_unpack_optimizeInEEvPKhhiPa Unexecuted instantiation: _ZN5doris10ForDecoderIsE19bit_unpack_optimizeIlEEvPKhhiPs Unexecuted instantiation: _ZN5doris10ForDecoderIsE19bit_unpack_optimizeInEEvPKhhiPs _ZN5doris10ForDecoderIiE19bit_unpack_optimizeIlEEvPKhhiPi Line | Count | Source | 437 | 9 | T* output) { | 438 | 9 | static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value, | 439 | 9 | "bit_unpack_optimize only supports U = int64_t or __int128_t"); | 440 | 9 | constexpr int u_size = sizeof(U); // Size of U | 441 | 9 | constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size) | 442 | 9 | int valid_bit = 0; // How many valid bits | 443 | 9 | int need_bit = 0; // still need | 444 | 9 | size_t input_size = (in_num * bit_width + 7) >> 3; // input's size | 445 | 9 | int full_batch_size = | 446 | 9 | cast_set<int>((input_size >> u_size_shift) | 447 | 9 | << u_size_shift); // Adjust input_size to a multiple of u_size | 448 | 9 | int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size. 
| 449 | | // The number of bits in input to adjust to multiples of 8 and thus more | 450 | 9 | int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width)); | 451 | | | 452 | | // to ensure that only bit_width bits are valid | 453 | 9 | T output_mask; | 454 | 9 | if (bit_width >= static_cast<int>(sizeof(T) * 8)) { | 455 | 0 | output_mask = static_cast<T>(~T(0)); | 456 | 9 | } else { | 457 | 9 | output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1); | 458 | 9 | } | 459 | | | 460 | 9 | U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking | 461 | | | 462 | 21 | for (int i = 0; i < full_batch_size; i += u_size) { | 463 | 12 | s = 0; | 464 | | | 465 | 12 | s = to_endian<std::endian::big>(*((U*)(input + i))); | 466 | | | 467 | | // Determine what the valid bits are based on u_size | 468 | 12 | valid_bit = u_size << 3; | 469 | | | 470 | | // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop. | 471 | 12 | if (tail_count == 0 && i == full_batch_size - u_size) { | 472 | 7 | valid_bit -= more_bit; | 473 | 7 | s >>= more_bit; | 474 | 7 | } | 475 | | | 476 | 12 | if (need_bit) { | 477 | | // The last time we take away the high bit_width - need_bit, | 478 | | // we need to make up the rest of the need_bit from the width. 
| 479 | | // Use valid_bit - need_bit to compute high need_bit bits of s | 480 | | // perform an AND operation to ensure that only need_bit bits are valid | 481 | 0 | auto mask = (static_cast<U>(1) << need_bit) - 1; | 482 | 0 | auto shifted = s >> (valid_bit - need_bit); | 483 | 0 | auto masked_result = shifted & mask; | 484 | 0 | if constexpr (sizeof(T) <= 4) { | 485 | 0 | *output |= static_cast<T>(static_cast<uint32_t>(masked_result)); | 486 | | } else { | 487 | | *output |= static_cast<T>(masked_result); | 488 | | } | 489 | 0 | output++; | 490 | 0 | valid_bit -= need_bit; | 491 | 0 | } | 492 | | | 493 | 12 | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 494 | 12 | int remainder = valid_bit - num * bit_width; // How many bits are left to store | 495 | | | 496 | | // Starting with the highest valid bit, take out bit_width bits in sequence | 497 | | // perform an AND operation with output_mask to ensure that only bit_width bits are valid | 498 | | // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end | 499 | | // But since there are still remainder bits that can't be processed, need to add the remainder | 500 | 780 | for (int j = 0; j < num; j++) { | 501 | 768 | *output = | 502 | 768 | static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask); | 503 | 768 | output++; | 504 | 768 | } | 505 | | | 506 | 12 | if (remainder) { | 507 | | // Process the last remaining remainder bit. | 508 | | // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits. 
| 509 | | // output = y << (bit_width - remainder) Use the high bit_width - remainder bit | 510 | 0 | if constexpr (sizeof(T) <= 4) { | 511 | 0 | auto masked_value = static_cast<T>( | 512 | 0 | static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1))); | 513 | 0 | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 514 | | } else { | 515 | | auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1))); | 516 | | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 517 | | } | 518 | | // Already have remainder bits, next time need bit_width - remainder bits | 519 | 0 | need_bit = bit_width - remainder; | 520 | 12 | } else { | 521 | 12 | need_bit = 0; | 522 | 12 | } | 523 | 12 | } | 524 | | | 525 | | // remainder | 526 | 9 | if (tail_count) { | 527 | | // Put the tail_count numbers in the input into s in order, each number occupies 8 bit | 528 | 2 | for (int i = 0; i < tail_count; i++) { | 529 | 1 | s <<= 8; | 530 | 1 | s |= input[full_batch_size + i]; | 531 | 1 | } | 532 | | | 533 | | // tail * 8 is the number of bits that are left to process | 534 | | // tail * 8 - more_bit is to remove the last more_bit | 535 | 1 | valid_bit = (tail_count << 3) - more_bit; | 536 | 1 | s >>= more_bit; | 537 | | | 538 | | // same as before | 539 | 1 | if (need_bit) { | 540 | 0 | if constexpr (sizeof(T) <= 4) { | 541 | 0 | *output |= static_cast<T>(static_cast<uint32_t>( | 542 | 0 | (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1))); | 543 | | } else { | 544 | | *output |= static_cast<T>((s >> (valid_bit - need_bit)) & | 545 | | ((static_cast<U>(1) << need_bit) - 1)); | 546 | | } | 547 | 0 | output++; | 548 | 0 | valid_bit -= need_bit; | 549 | 0 | } | 550 | | | 551 | 1 | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 552 | | | 553 | | // same as before | 554 | 3 | for (int j = 0; j < num; j++) { | 555 | 2 | *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) 
& output_mask); | 556 | 2 | output++; | 557 | 2 | } | 558 | 1 | } | 559 | 9 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderIiE19bit_unpack_optimizeInEEvPKhhiPi _ZN5doris10ForDecoderIlE19bit_unpack_optimizeIlEEvPKhhiPl Line | Count | Source | 437 | 12.3k | T* output) { | 438 | 12.3k | static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value, | 439 | 12.3k | "bit_unpack_optimize only supports U = int64_t or __int128_t"); | 440 | 12.3k | constexpr int u_size = sizeof(U); // Size of U | 441 | 12.3k | constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size) | 442 | 12.3k | int valid_bit = 0; // How many valid bits | 443 | 12.3k | int need_bit = 0; // still need | 444 | 12.3k | size_t input_size = (in_num * bit_width + 7) >> 3; // input's size | 445 | 12.3k | int full_batch_size = | 446 | 12.3k | cast_set<int>((input_size >> u_size_shift) | 447 | 12.3k | << u_size_shift); // Adjust input_size to a multiple of u_size | 448 | 12.3k | int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size. | 449 | | // The number of bits in input to adjust to multiples of 8 and thus more | 450 | 12.3k | int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width)); | 451 | | | 452 | | // to ensure that only bit_width bits are valid | 453 | 12.3k | T output_mask; | 454 | 12.3k | if (bit_width >= static_cast<int>(sizeof(T) * 8)) { | 455 | 0 | output_mask = static_cast<T>(~T(0)); | 456 | 12.3k | } else { | 457 | 12.3k | output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1); | 458 | 12.3k | } | 459 | | | 460 | 12.3k | U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking | 461 | | | 462 | 278k | for (int i = 0; i < full_batch_size; i += u_size) { | 463 | 266k | s = 0; | 464 | | | 465 | 266k | s = to_endian<std::endian::big>(*((U*)(input + i))); | 466 | | | 467 | | // Determine what the valid bits are based on u_size | 468 | 266k | valid_bit = u_size << 3; | 469 | | | 470 | | // If input_size is exactly a multiple of 8, then need 
to remove the last more_bit in the last loop. | 471 | 266k | if (tail_count == 0 && i == full_batch_size - u_size) { | 472 | 5.19k | valid_bit -= more_bit; | 473 | 5.19k | s >>= more_bit; | 474 | 5.19k | } | 475 | | | 476 | 266k | if (need_bit) { | 477 | | // The last time we take away the high bit_width - need_bit, | 478 | | // we need to make up the rest of the need_bit from the width. | 479 | | // Use valid_bit - need_bit to compute high need_bit bits of s | 480 | | // perform an AND operation to ensure that only need_bit bits are valid | 481 | 207k | auto mask = (static_cast<U>(1) << need_bit) - 1; | 482 | 207k | auto shifted = s >> (valid_bit - need_bit); | 483 | 207k | auto masked_result = shifted & mask; | 484 | | if constexpr (sizeof(T) <= 4) { | 485 | | *output |= static_cast<T>(static_cast<uint32_t>(masked_result)); | 486 | 207k | } else { | 487 | 207k | *output |= static_cast<T>(masked_result); | 488 | 207k | } | 489 | 207k | output++; | 490 | 207k | valid_bit -= need_bit; | 491 | 207k | } | 492 | | | 493 | 266k | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 494 | 266k | int remainder = valid_bit - num * bit_width; // How many bits are left to store | 495 | | | 496 | | // Starting with the highest valid bit, take out bit_width bits in sequence | 497 | | // perform an AND operation with output_mask to ensure that only bit_width bits are valid | 498 | | // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end | 499 | | // But since there are still remainder bits that can't be processed, need to add the remainder | 500 | 1.07M | for (int j = 0; j < num; j++) { | 501 | 809k | *output = | 502 | 809k | static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask); | 503 | 809k | output++; | 504 | 809k | } | 505 | | | 506 | 266k | if (remainder) { | 507 | | // Process the last remaining remainder bit. 
| 508 | | // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits. | 509 | | // output = y << (bit_width - remainder) Use the high bit_width - remainder bit | 510 | | if constexpr (sizeof(T) <= 4) { | 511 | | auto masked_value = static_cast<T>( | 512 | | static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1))); | 513 | | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 514 | 212k | } else { | 515 | 212k | auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1))); | 516 | 212k | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 517 | 212k | } | 518 | | // Already have remainder bits, next time need bit_width - remainder bits | 519 | 212k | need_bit = bit_width - remainder; | 520 | 212k | } else { | 521 | 53.9k | need_bit = 0; | 522 | 53.9k | } | 523 | 266k | } | 524 | | | 525 | | // remainder | 526 | 12.3k | if (tail_count) { | 527 | | // Put the tail_count numbers in the input into s in order, each number occupies 8 bit | 528 | 35.0k | for (int i = 0; i < tail_count; i++) { | 529 | 28.0k | s <<= 8; | 530 | 28.0k | s |= input[full_batch_size + i]; | 531 | 28.0k | } | 532 | | | 533 | | // tail * 8 is the number of bits that are left to process | 534 | | // tail * 8 - more_bit is to remove the last more_bit | 535 | 6.98k | valid_bit = (tail_count << 3) - more_bit; | 536 | 6.98k | s >>= more_bit; | 537 | | | 538 | | // same as before | 539 | 6.98k | if (need_bit) { | 540 | | if constexpr (sizeof(T) <= 4) { | 541 | | *output |= static_cast<T>(static_cast<uint32_t>( | 542 | | (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1))); | 543 | 5.14k | } else { | 544 | 5.14k | *output |= static_cast<T>((s >> (valid_bit - need_bit)) & | 545 | 5.14k | ((static_cast<U>(1) << need_bit) - 1)); | 546 | 5.14k | } | 547 | 5.14k | output++; | 548 | 5.14k | valid_bit -= need_bit; | 549 | 5.14k | } | 550 | | | 551 | 6.98k | int num = valid_bit / bit_width; // How many 
outputs can be processed at a time | 552 | | | 553 | | // same as before | 554 | 30.6k | for (int j = 0; j < num; j++) { | 555 | 23.6k | *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask); | 556 | 23.6k | output++; | 557 | 23.6k | } | 558 | 6.98k | } | 559 | 12.3k | } |
_ZN5doris10ForDecoderIlE19bit_unpack_optimizeInEEvPKhhiPl Line | Count | Source | 437 | 12.1k | T* output) { | 438 | 12.1k | static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value, | 439 | 12.1k | "bit_unpack_optimize only supports U = int64_t or __int128_t"); | 440 | 12.1k | constexpr int u_size = sizeof(U); // Size of U | 441 | 12.1k | constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size) | 442 | 12.1k | int valid_bit = 0; // How many valid bits | 443 | 12.1k | int need_bit = 0; // still need | 444 | 12.1k | size_t input_size = (in_num * bit_width + 7) >> 3; // input's size | 445 | 12.1k | int full_batch_size = | 446 | 12.1k | cast_set<int>((input_size >> u_size_shift) | 447 | 12.1k | << u_size_shift); // Adjust input_size to a multiple of u_size | 448 | 12.1k | int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size. | 449 | | // The number of bits in input to adjust to multiples of 8 and thus more | 450 | 12.1k | int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width)); | 451 | | | 452 | | // to ensure that only bit_width bits are valid | 453 | 12.1k | T output_mask; | 454 | 12.1k | if (bit_width >= static_cast<int>(sizeof(T) * 8)) { | 455 | 0 | output_mask = static_cast<T>(~T(0)); | 456 | 12.1k | } else { | 457 | 12.1k | output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1); | 458 | 12.1k | } | 459 | | | 460 | 12.1k | U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking | 461 | | | 462 | 403k | for (int i = 0; i < full_batch_size; i += u_size) { | 463 | 391k | s = 0; | 464 | | | 465 | 391k | s = to_endian<std::endian::big>(*((U*)(input + i))); | 466 | | | 467 | | // Determine what the valid bits are based on u_size | 468 | 391k | valid_bit = u_size << 3; | 469 | | | 470 | | // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop. 
| 471 | 391k | if (tail_count == 0 && i == full_batch_size - u_size) { | 472 | 4.55k | valid_bit -= more_bit; | 473 | 4.55k | s >>= more_bit; | 474 | 4.55k | } | 475 | | | 476 | 391k | if (need_bit) { | 477 | | // The last time we take away the high bit_width - need_bit, | 478 | | // we need to make up the rest of the need_bit from the width. | 479 | | // Use valid_bit - need_bit to compute high need_bit bits of s | 480 | | // perform an AND operation to ensure that only need_bit bits are valid | 481 | 367k | auto mask = (static_cast<U>(1) << need_bit) - 1; | 482 | 367k | auto shifted = s >> (valid_bit - need_bit); | 483 | 367k | auto masked_result = shifted & mask; | 484 | | if constexpr (sizeof(T) <= 4) { | 485 | | *output |= static_cast<T>(static_cast<uint32_t>(masked_result)); | 486 | 367k | } else { | 487 | 367k | *output |= static_cast<T>(masked_result); | 488 | 367k | } | 489 | 367k | output++; | 490 | 367k | valid_bit -= need_bit; | 491 | 367k | } | 492 | | | 493 | 391k | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 494 | 391k | int remainder = valid_bit - num * bit_width; // How many bits are left to store | 495 | | | 496 | | // Starting with the highest valid bit, take out bit_width bits in sequence | 497 | | // perform an AND operation with output_mask to ensure that only bit_width bits are valid | 498 | | // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end | 499 | | // But since there are still remainder bits that can't be processed, need to add the remainder | 500 | 1.05M | for (int j = 0; j < num; j++) { | 501 | 663k | *output = | 502 | 663k | static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask); | 503 | 663k | output++; | 504 | 663k | } | 505 | | | 506 | 391k | if (remainder) { | 507 | | // Process the last remaining remainder bit. | 508 | | // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits. 
| 509 | | // output = y << (bit_width - remainder) Use the high bit_width - remainder bit | 510 | | if constexpr (sizeof(T) <= 4) { | 511 | | auto masked_value = static_cast<T>( | 512 | | static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1))); | 513 | | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 514 | 374k | } else { | 515 | 374k | auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1))); | 516 | 374k | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 517 | 374k | } | 518 | | // Already have remainder bits, next time need bit_width - remainder bits | 519 | 374k | need_bit = bit_width - remainder; | 520 | 374k | } else { | 521 | 16.9k | need_bit = 0; | 522 | 16.9k | } | 523 | 391k | } | 524 | | | 525 | | // remainder | 526 | 12.1k | if (tail_count) { | 527 | | // Put the tail_count numbers in the input into s in order, each number occupies 8 bit | 528 | 68.5k | for (int i = 0; i < tail_count; i++) { | 529 | 60.9k | s <<= 8; | 530 | 60.9k | s |= input[full_batch_size + i]; | 531 | 60.9k | } | 532 | | | 533 | | // tail * 8 is the number of bits that are left to process | 534 | | // tail * 8 - more_bit is to remove the last more_bit | 535 | 7.60k | valid_bit = (tail_count << 3) - more_bit; | 536 | 7.60k | s >>= more_bit; | 537 | | | 538 | | // same as before | 539 | 7.60k | if (need_bit) { | 540 | | if constexpr (sizeof(T) <= 4) { | 541 | | *output |= static_cast<T>(static_cast<uint32_t>( | 542 | | (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1))); | 543 | 7.31k | } else { | 544 | 7.31k | *output |= static_cast<T>((s >> (valid_bit - need_bit)) & | 545 | 7.31k | ((static_cast<U>(1) << need_bit) - 1)); | 546 | 7.31k | } | 547 | 7.31k | output++; | 548 | 7.31k | valid_bit -= need_bit; | 549 | 7.31k | } | 550 | | | 551 | 7.60k | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 552 | | | 553 | | // same as before | 554 | 14.1k | for (int j 
= 0; j < num; j++) { | 555 | 6.51k | *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask); | 556 | 6.51k | output++; | 557 | 6.51k | } | 558 | 7.60k | } | 559 | 12.1k | } |
_ZN5doris10ForDecoderInE19bit_unpack_optimizeIlEEvPKhhiPn Line | Count | Source | 437 | 8.28k | T* output) { | 438 | 8.28k | static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value, | 439 | 8.28k | "bit_unpack_optimize only supports U = int64_t or __int128_t"); | 440 | 8.28k | constexpr int u_size = sizeof(U); // Size of U | 441 | 8.28k | constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size) | 442 | 8.28k | int valid_bit = 0; // How many valid bits | 443 | 8.28k | int need_bit = 0; // still need | 444 | 8.28k | size_t input_size = (in_num * bit_width + 7) >> 3; // input's size | 445 | 8.28k | int full_batch_size = | 446 | 8.28k | cast_set<int>((input_size >> u_size_shift) | 447 | 8.28k | << u_size_shift); // Adjust input_size to a multiple of u_size | 448 | 8.28k | int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size. | 449 | | // The number of bits in input to adjust to multiples of 8 and thus more | 450 | 8.28k | int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width)); | 451 | | | 452 | | // to ensure that only bit_width bits are valid | 453 | 8.28k | T output_mask; | 454 | 8.28k | if (bit_width >= static_cast<int>(sizeof(T) * 8)) { | 455 | 0 | output_mask = static_cast<T>(~T(0)); | 456 | 8.28k | } else { | 457 | 8.28k | output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1); | 458 | 8.28k | } | 459 | | | 460 | 8.28k | U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking | 461 | | | 462 | 274k | for (int i = 0; i < full_batch_size; i += u_size) { | 463 | 266k | s = 0; | 464 | | | 465 | 266k | s = to_endian<std::endian::big>(*((U*)(input + i))); | 466 | | | 467 | | // Determine what the valid bits are based on u_size | 468 | 266k | valid_bit = u_size << 3; | 469 | | | 470 | | // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop. 
| 471 | 266k | if (tail_count == 0 && i == full_batch_size - u_size) { | 472 | 1.12k | valid_bit -= more_bit; | 473 | 1.12k | s >>= more_bit; | 474 | 1.12k | } | 475 | | | 476 | 266k | if (need_bit) { | 477 | | // The last time we take away the high bit_width - need_bit, | 478 | | // we need to make up the rest of the need_bit from the width. | 479 | | // Use valid_bit - need_bit to compute high need_bit bits of s | 480 | | // perform an AND operation to ensure that only need_bit bits are valid | 481 | 207k | auto mask = (static_cast<U>(1) << need_bit) - 1; | 482 | 207k | auto shifted = s >> (valid_bit - need_bit); | 483 | 207k | auto masked_result = shifted & mask; | 484 | | if constexpr (sizeof(T) <= 4) { | 485 | | *output |= static_cast<T>(static_cast<uint32_t>(masked_result)); | 486 | 207k | } else { | 487 | 207k | *output |= static_cast<T>(masked_result); | 488 | 207k | } | 489 | 207k | output++; | 490 | 207k | valid_bit -= need_bit; | 491 | 207k | } | 492 | | | 493 | 266k | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 494 | 266k | int remainder = valid_bit - num * bit_width; // How many bits are left to store | 495 | | | 496 | | // Starting with the highest valid bit, take out bit_width bits in sequence | 497 | | // perform an AND operation with output_mask to ensure that only bit_width bits are valid | 498 | | // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end | 499 | | // But since there are still remainder bits that can't be processed, need to add the remainder | 500 | 1.07M | for (int j = 0; j < num; j++) { | 501 | 808k | *output = | 502 | 808k | static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask); | 503 | 808k | output++; | 504 | 808k | } | 505 | | | 506 | 266k | if (remainder) { | 507 | | // Process the last remaining remainder bit. | 508 | | // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits. 
| 509 | | // output = y << (bit_width - remainder) Use the high bit_width - remainder bit | 510 | | if constexpr (sizeof(T) <= 4) { | 511 | | auto masked_value = static_cast<T>( | 512 | | static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1))); | 513 | | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 514 | 212k | } else { | 515 | 212k | auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1))); | 516 | 212k | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 517 | 212k | } | 518 | | // Already have remainder bits, next time need bit_width - remainder bits | 519 | 212k | need_bit = bit_width - remainder; | 520 | 212k | } else { | 521 | 53.9k | need_bit = 0; | 522 | 53.9k | } | 523 | 266k | } | 524 | | | 525 | | // remainder | 526 | 8.28k | if (tail_count) { | 527 | | // Put the tail_count numbers in the input into s in order, each number occupies 8 bit | 528 | 35.2k | for (int i = 0; i < tail_count; i++) { | 529 | 28.1k | s <<= 8; | 530 | 28.1k | s |= input[full_batch_size + i]; | 531 | 28.1k | } | 532 | | | 533 | | // tail * 8 is the number of bits that are left to process | 534 | | // tail * 8 - more_bit is to remove the last more_bit | 535 | 7.04k | valid_bit = (tail_count << 3) - more_bit; | 536 | 7.04k | s >>= more_bit; | 537 | | | 538 | | // same as before | 539 | 7.04k | if (need_bit) { | 540 | | if constexpr (sizeof(T) <= 4) { | 541 | | *output |= static_cast<T>(static_cast<uint32_t>( | 542 | | (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1))); | 543 | 5.14k | } else { | 544 | 5.14k | *output |= static_cast<T>((s >> (valid_bit - need_bit)) & | 545 | 5.14k | ((static_cast<U>(1) << need_bit) - 1)); | 546 | 5.14k | } | 547 | 5.14k | output++; | 548 | 5.14k | valid_bit -= need_bit; | 549 | 5.14k | } | 550 | | | 551 | 7.04k | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 552 | | | 553 | | // same as before | 554 | 30.7k | for (int j 
= 0; j < num; j++) { | 555 | 23.7k | *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask); | 556 | 23.7k | output++; | 557 | 23.7k | } | 558 | 7.04k | } | 559 | 8.28k | } |
_ZN5doris10ForDecoderInE19bit_unpack_optimizeInEEvPKhhiPn Line | Count | Source | 437 | 48.5k | T* output) { | 438 | 48.5k | static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value, | 439 | 48.5k | "bit_unpack_optimize only supports U = int64_t or __int128_t"); | 440 | 48.5k | constexpr int u_size = sizeof(U); // Size of U | 441 | 48.5k | constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size) | 442 | 48.5k | int valid_bit = 0; // How many valid bits | 443 | 48.5k | int need_bit = 0; // still need | 444 | 48.5k | size_t input_size = (in_num * bit_width + 7) >> 3; // input's size | 445 | 48.5k | int full_batch_size = | 446 | 48.5k | cast_set<int>((input_size >> u_size_shift) | 447 | 48.5k | << u_size_shift); // Adjust input_size to a multiple of u_size | 448 | 48.5k | int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size. | 449 | | // The number of bits in input to adjust to multiples of 8 and thus more | 450 | 48.5k | int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width)); | 451 | | | 452 | | // to ensure that only bit_width bits are valid | 453 | 48.5k | T output_mask; | 454 | 48.5k | if (bit_width >= static_cast<int>(sizeof(T) * 8)) { | 455 | 0 | output_mask = static_cast<T>(~T(0)); | 456 | 48.5k | } else { | 457 | 48.5k | output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1); | 458 | 48.5k | } | 459 | | | 460 | 48.5k | U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking | 461 | | | 462 | 3.52M | for (int i = 0; i < full_batch_size; i += u_size) { | 463 | 3.47M | s = 0; | 464 | | | 465 | 3.47M | s = to_endian<std::endian::big>(*((U*)(input + i))); | 466 | | | 467 | | // Determine what the valid bits are based on u_size | 468 | 3.47M | valid_bit = u_size << 3; | 469 | | | 470 | | // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop. 
| 471 | 3.47M | if (tail_count == 0 && i == full_batch_size - u_size) { | 472 | 10.8k | valid_bit -= more_bit; | 473 | 10.8k | s >>= more_bit; | 474 | 10.8k | } | 475 | | | 476 | 3.47M | if (need_bit) { | 477 | | // The last time we take away the high bit_width - need_bit, | 478 | | // we need to make up the rest of the need_bit from the width. | 479 | | // Use valid_bit - need_bit to compute high need_bit bits of s | 480 | | // perform an AND operation to ensure that only need_bit bits are valid | 481 | 3.30M | auto mask = (static_cast<U>(1) << need_bit) - 1; | 482 | 3.30M | auto shifted = s >> (valid_bit - need_bit); | 483 | 3.30M | auto masked_result = shifted & mask; | 484 | | if constexpr (sizeof(T) <= 4) { | 485 | | *output |= static_cast<T>(static_cast<uint32_t>(masked_result)); | 486 | 3.30M | } else { | 487 | 3.30M | *output |= static_cast<T>(masked_result); | 488 | 3.30M | } | 489 | 3.30M | output++; | 490 | 3.30M | valid_bit -= need_bit; | 491 | 3.30M | } | 492 | | | 493 | 3.47M | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 494 | 3.47M | int remainder = valid_bit - num * bit_width; // How many bits are left to store | 495 | | | 496 | | // Starting with the highest valid bit, take out bit_width bits in sequence | 497 | | // perform an AND operation with output_mask to ensure that only bit_width bits are valid | 498 | | // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end | 499 | | // But since there are still remainder bits that can't be processed, need to add the remainder | 500 | 5.30M | for (int j = 0; j < num; j++) { | 501 | 1.83M | *output = | 502 | 1.83M | static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask); | 503 | 1.83M | output++; | 504 | 1.83M | } | 505 | | | 506 | 3.47M | if (remainder) { | 507 | | // Process the last remaining remainder bit. | 508 | | // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits. 
| 509 | | // output = y << (bit_width - remainder) Use the high bit_width - remainder bit | 510 | | if constexpr (sizeof(T) <= 4) { | 511 | | auto masked_value = static_cast<T>( | 512 | | static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1))); | 513 | | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 514 | 3.34M | } else { | 515 | 3.34M | auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1))); | 516 | 3.34M | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 517 | 3.34M | } | 518 | | // Already have remainder bits, next time need bit_width - remainder bits | 519 | 3.34M | need_bit = bit_width - remainder; | 520 | 3.34M | } else { | 521 | 132k | need_bit = 0; | 522 | 132k | } | 523 | 3.47M | } | 524 | | | 525 | | // remainder | 526 | 48.5k | if (tail_count) { | 527 | | // Put the tail_count numbers in the input into s in order, each number occupies 8 bit | 528 | 338k | for (int i = 0; i < tail_count; i++) { | 529 | 300k | s <<= 8; | 530 | 300k | s |= input[full_batch_size + i]; | 531 | 300k | } | 532 | | | 533 | | // tail * 8 is the number of bits that are left to process | 534 | | // tail * 8 - more_bit is to remove the last more_bit | 535 | 37.6k | valid_bit = (tail_count << 3) - more_bit; | 536 | 37.6k | s >>= more_bit; | 537 | | | 538 | | // same as before | 539 | 37.6k | if (need_bit) { | 540 | | if constexpr (sizeof(T) <= 4) { | 541 | | *output |= static_cast<T>(static_cast<uint32_t>( | 542 | | (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1))); | 543 | 36.4k | } else { | 544 | 36.4k | *output |= static_cast<T>((s >> (valid_bit - need_bit)) & | 545 | 36.4k | ((static_cast<U>(1) << need_bit) - 1)); | 546 | 36.4k | } | 547 | 36.4k | output++; | 548 | 36.4k | valid_bit -= need_bit; | 549 | 36.4k | } | 550 | | | 551 | 37.6k | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 552 | | | 553 | | // same as before | 554 | 50.9k | for (int 
j = 0; j < num; j++) { | 555 | 13.3k | *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask); | 556 | 13.3k | output++; | 557 | 13.3k | } | 558 | 37.6k | } | 559 | 48.5k | } |
Unexecuted instantiation: _ZN5doris10ForDecoderIhE19bit_unpack_optimizeIlEEvPKhhiPh Unexecuted instantiation: _ZN5doris10ForDecoderIhE19bit_unpack_optimizeInEEvPKhhiPh Unexecuted instantiation: _ZN5doris10ForDecoderItE19bit_unpack_optimizeIlEEvPKhhiPt Unexecuted instantiation: _ZN5doris10ForDecoderItE19bit_unpack_optimizeInEEvPKhhiPt _ZN5doris10ForDecoderIjE19bit_unpack_optimizeIlEEvPKhhiPj Line | Count | Source | 437 | 5 | T* output) { | 438 | 5 | static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value, | 439 | 5 | "bit_unpack_optimize only supports U = int64_t or __int128_t"); | 440 | 5 | constexpr int u_size = sizeof(U); // Size of U | 441 | 5 | constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size) | 442 | 5 | int valid_bit = 0; // How many valid bits | 443 | 5 | int need_bit = 0; // still need | 444 | 5 | size_t input_size = (in_num * bit_width + 7) >> 3; // input's size | 445 | 5 | int full_batch_size = | 446 | 5 | cast_set<int>((input_size >> u_size_shift) | 447 | 5 | << u_size_shift); // Adjust input_size to a multiple of u_size | 448 | 5 | int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size. 
| 449 | | // The number of bits in input to adjust to multiples of 8 and thus more | 450 | 5 | int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width)); | 451 | | | 452 | | // to ensure that only bit_width bits are valid | 453 | 5 | T output_mask; | 454 | 5 | if (bit_width >= static_cast<int>(sizeof(T) * 8)) { | 455 | 0 | output_mask = static_cast<T>(~T(0)); | 456 | 5 | } else { | 457 | 5 | output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1); | 458 | 5 | } | 459 | | | 460 | 5 | U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking | 461 | | | 462 | 15 | for (int i = 0; i < full_batch_size; i += u_size) { | 463 | 10 | s = 0; | 464 | | | 465 | 10 | s = to_endian<std::endian::big>(*((U*)(input + i))); | 466 | | | 467 | | // Determine what the valid bits are based on u_size | 468 | 10 | valid_bit = u_size << 3; | 469 | | | 470 | | // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop. | 471 | 10 | if (tail_count == 0 && i == full_batch_size - u_size) { | 472 | 5 | valid_bit -= more_bit; | 473 | 5 | s >>= more_bit; | 474 | 5 | } | 475 | | | 476 | 10 | if (need_bit) { | 477 | | // The last time we take away the high bit_width - need_bit, | 478 | | // we need to make up the rest of the need_bit from the width. 
| 479 | | // Use valid_bit - need_bit to compute high need_bit bits of s | 480 | | // perform an AND operation to ensure that only need_bit bits are valid | 481 | 0 | auto mask = (static_cast<U>(1) << need_bit) - 1; | 482 | 0 | auto shifted = s >> (valid_bit - need_bit); | 483 | 0 | auto masked_result = shifted & mask; | 484 | 0 | if constexpr (sizeof(T) <= 4) { | 485 | 0 | *output |= static_cast<T>(static_cast<uint32_t>(masked_result)); | 486 | | } else { | 487 | | *output |= static_cast<T>(masked_result); | 488 | | } | 489 | 0 | output++; | 490 | 0 | valid_bit -= need_bit; | 491 | 0 | } | 492 | | | 493 | 10 | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 494 | 10 | int remainder = valid_bit - num * bit_width; // How many bits are left to store | 495 | | | 496 | | // Starting with the highest valid bit, take out bit_width bits in sequence | 497 | | // perform an AND operation with output_mask to ensure that only bit_width bits are valid | 498 | | // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end | 499 | | // But since there are still remainder bits that can't be processed, need to add the remainder | 500 | 650 | for (int j = 0; j < num; j++) { | 501 | 640 | *output = | 502 | 640 | static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask); | 503 | 640 | output++; | 504 | 640 | } | 505 | | | 506 | 10 | if (remainder) { | 507 | | // Process the last remaining remainder bit. | 508 | | // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits. 
| 509 | | // output = y << (bit_width - remainder) Use the high bit_width - remainder bit | 510 | 0 | if constexpr (sizeof(T) <= 4) { | 511 | 0 | auto masked_value = static_cast<T>( | 512 | 0 | static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1))); | 513 | 0 | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 514 | | } else { | 515 | | auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1))); | 516 | | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 517 | | } | 518 | | // Already have remainder bits, next time need bit_width - remainder bits | 519 | 0 | need_bit = bit_width - remainder; | 520 | 10 | } else { | 521 | 10 | need_bit = 0; | 522 | 10 | } | 523 | 10 | } | 524 | | | 525 | | // remainder | 526 | 5 | if (tail_count) { | 527 | | // Put the tail_count numbers in the input into s in order, each number occupies 8 bit | 528 | 0 | for (int i = 0; i < tail_count; i++) { | 529 | 0 | s <<= 8; | 530 | 0 | s |= input[full_batch_size + i]; | 531 | 0 | } | 532 | | | 533 | | // tail * 8 is the number of bits that are left to process | 534 | | // tail * 8 - more_bit is to remove the last more_bit | 535 | 0 | valid_bit = (tail_count << 3) - more_bit; | 536 | 0 | s >>= more_bit; | 537 | | | 538 | | // same as before | 539 | 0 | if (need_bit) { | 540 | 0 | if constexpr (sizeof(T) <= 4) { | 541 | 0 | *output |= static_cast<T>(static_cast<uint32_t>( | 542 | 0 | (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1))); | 543 | | } else { | 544 | | *output |= static_cast<T>((s >> (valid_bit - need_bit)) & | 545 | | ((static_cast<U>(1) << need_bit) - 1)); | 546 | | } | 547 | 0 | output++; | 548 | 0 | valid_bit -= need_bit; | 549 | 0 | } | 550 | |
| 551 | 0 | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 552 | | | 553 | | // same as before | 554 | 0 | for (int j = 0; j < num; j++) { | 555 | 0 | *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask); | 556 | 0 | output++; | 557 | 0 | } | 558 | 0 | } | 559 | 5 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderIjE19bit_unpack_optimizeInEEvPKhhiPj Unexecuted instantiation: _ZN5doris10ForDecoderImE19bit_unpack_optimizeIlEEvPKhhiPm Unexecuted instantiation: _ZN5doris10ForDecoderImE19bit_unpack_optimizeInEEvPKhhiPm Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE19bit_unpack_optimizeIlEEvPKhhiPS1_ Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE19bit_unpack_optimizeInEEvPKhhiPS1_ Unexecuted instantiation: _ZN5doris10ForDecoderIoE19bit_unpack_optimizeIlEEvPKhhiPo Unexecuted instantiation: _ZN5doris10ForDecoderIoE19bit_unpack_optimizeInEEvPKhhiPo |
560 | | |
561 | | // The reverse of bit_pack method, get original integer data list from packed bits |
562 | | // param[in] input: the packed bits need to unpack |
563 | | // param[in] in_num: the integer number in packed bits |
564 | | // param[in] bit_width: how many bit we used to store each integer data |
565 | | // param[out] output: the original integer data list |
566 | | template <typename T> |
567 | 81.3k | void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) { |
568 | | /* |
569 | | When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times. |
570 | | When bit_width > 64, we must use __int128_t and unroll the loop 16 times. |
571 | | */ |
572 | 81.3k | if (bit_width <= 32) { |
573 | 20.6k | bit_unpack_optimize<int64_t>(input, in_num, bit_width, output); |
574 | 60.6k | } else { |
575 | 60.6k | bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output); |
576 | 60.6k | } |
577 | 81.3k | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE10bit_unpackEPKhhiPa Unexecuted instantiation: _ZN5doris10ForDecoderIsE10bit_unpackEPKhhiPs _ZN5doris10ForDecoderIiE10bit_unpackEPKhhiPi Line | Count | Source | 567 | 9 | void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) { | 568 | | /* | 569 | | When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times. | 570 | | When bit_width > 64, we must use __int128_t and unroll the loop 16 times. | 571 | | */ | 572 | 9 | if (bit_width <= 32) { | 573 | 9 | bit_unpack_optimize<int64_t>(input, in_num, bit_width, output); | 574 | 9 | } else { | 575 | 0 | bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output); | 576 | 0 | } | 577 | 9 | } |
_ZN5doris10ForDecoderIlE10bit_unpackEPKhhiPl Line | Count | Source | 567 | 24.4k | void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) { | 568 | | /* | 569 | | When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times. | 570 | | When bit_width > 64, we must use __int128_t and unroll the loop 16 times. | 571 | | */ | 572 | 24.4k | if (bit_width <= 32) { | 573 | 12.3k | bit_unpack_optimize<int64_t>(input, in_num, bit_width, output); | 574 | 12.3k | } else { | 575 | 12.1k | bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output); | 576 | 12.1k | } | 577 | 24.4k | } |
_ZN5doris10ForDecoderInE10bit_unpackEPKhhiPn Line | Count | Source | 567 | 56.8k | void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) { | 568 | | /* | 569 | | When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times. | 570 | | When bit_width > 64, we must use __int128_t and unroll the loop 16 times. | 571 | | */ | 572 | 56.8k | if (bit_width <= 32) { | 573 | 8.28k | bit_unpack_optimize<int64_t>(input, in_num, bit_width, output); | 574 | 48.5k | } else { | 575 | 48.5k | bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output); | 576 | 48.5k | } | 577 | 56.8k | } |
Unexecuted instantiation: _ZN5doris10ForDecoderIhE10bit_unpackEPKhhiPh Unexecuted instantiation: _ZN5doris10ForDecoderItE10bit_unpackEPKhhiPt _ZN5doris10ForDecoderIjE10bit_unpackEPKhhiPj Line | Count | Source | 567 | 5 | void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) { | 568 | | /* | 569 | | When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times. | 570 | | When bit_width > 64, we must use __int128_t and unroll the loop 16 times. | 571 | | */ | 572 | 5 | if (bit_width <= 32) { | 573 | 5 | bit_unpack_optimize<int64_t>(input, in_num, bit_width, output); | 574 | 5 | } else { | 575 | 0 | bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output); | 576 | 0 | } | 577 | 5 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderImE10bit_unpackEPKhhiPm Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE10bit_unpackEPKhhiPS1_ Unexecuted instantiation: _ZN5doris10ForDecoderIoE10bit_unpackEPKhhiPo |
578 | | |
579 | | template <typename T> |
580 | 4.17M | void ForDecoder<T>::decode_current_frame(T* output) { |
581 | 4.17M | uint32_t frame_index = _current_index / _max_frame_size; |
582 | 4.17M | if (frame_index == _current_decoded_frame) { |
583 | 4.12M | return; // current frame already decoded |
584 | 4.12M | } |
585 | 48.9k | _current_decoded_frame = frame_index; |
586 | 48.9k | uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index)); |
587 | | |
588 | 48.9k | uint32_t base_offset = _frame_offsets[_current_decoded_frame]; |
589 | 48.9k | T min = 0; |
590 | 48.9k | uint32_t delta_offset = 0; |
591 | 48.9k | if constexpr (sizeof(T) == 16) { |
592 | 24.4k | min = static_cast<T>(decode_fixed128_le(_buffer + base_offset)); |
593 | 24.4k | delta_offset = base_offset + 16; |
594 | 24.4k | } else if constexpr (sizeof(T) == 8) { |
595 | 24.4k | min = static_cast<T>(decode_fixed64_le(_buffer + base_offset)); |
596 | 24.4k | delta_offset = base_offset + 8; |
597 | 24.4k | } else { |
598 | 14 | min = static_cast<T>(decode_fixed32_le(_buffer + base_offset)); |
599 | 14 | delta_offset = base_offset + 4; |
600 | 14 | } |
601 | | |
602 | 48.9k | uint8_t bit_width = _bit_widths[_current_decoded_frame]; |
603 | | |
604 | 48.9k | bool is_original_value = _storage_formats[_current_decoded_frame] == 2; |
605 | 48.9k | if (is_original_value) { |
606 | 0 | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output); |
607 | 48.9k | } else { |
608 | 48.9k | bool is_ascending = _storage_formats[_current_decoded_frame] == 1; |
609 | 48.9k | std::vector<T> delta_values(current_frame_size); |
610 | 48.9k | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data()); |
611 | 48.9k | if (is_ascending) { |
612 | 451 | T pre_value = min; |
613 | 3.87k | for (uint8_t i = 0; i < current_frame_size; i++) { |
614 | 3.42k | T value = delta_values[i] + pre_value; |
615 | 3.42k | output[i] = value; |
616 | 3.42k | pre_value = value; |
617 | 3.42k | } |
618 | 48.4k | } else { |
619 | 4.22M | for (uint8_t i = 0; i < current_frame_size; i++) { |
620 | 4.17M | output[i] = delta_values[i] + min; |
621 | 4.17M | } |
622 | 48.4k | } |
623 | 48.9k | } |
624 | 48.9k | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE20decode_current_frameEPa Unexecuted instantiation: _ZN5doris10ForDecoderIsE20decode_current_frameEPs _ZN5doris10ForDecoderIiE20decode_current_frameEPi Line | Count | Source | 580 | 10 | void ForDecoder<T>::decode_current_frame(T* output) { | 581 | 10 | uint32_t frame_index = _current_index / _max_frame_size; | 582 | 10 | if (frame_index == _current_decoded_frame) { | 583 | 1 | return; // current frame already decoded | 584 | 1 | } | 585 | 9 | _current_decoded_frame = frame_index; | 586 | 9 | uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index)); | 587 | | | 588 | 9 | uint32_t base_offset = _frame_offsets[_current_decoded_frame]; | 589 | 9 | T min = 0; | 590 | 9 | uint32_t delta_offset = 0; | 591 | | if constexpr (sizeof(T) == 16) { | 592 | | min = static_cast<T>(decode_fixed128_le(_buffer + base_offset)); | 593 | | delta_offset = base_offset + 16; | 594 | | } else if constexpr (sizeof(T) == 8) { | 595 | | min = static_cast<T>(decode_fixed64_le(_buffer + base_offset)); | 596 | | delta_offset = base_offset + 8; | 597 | 9 | } else { | 598 | 9 | min = static_cast<T>(decode_fixed32_le(_buffer + base_offset)); | 599 | 9 | delta_offset = base_offset + 4; | 600 | 9 | } | 601 | | | 602 | 9 | uint8_t bit_width = _bit_widths[_current_decoded_frame]; | 603 | | | 604 | 9 | bool is_original_value = _storage_formats[_current_decoded_frame] == 2; | 605 | 9 | if (is_original_value) { | 606 | 0 | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output); | 607 | 9 | } else { | 608 | 9 | bool is_ascending = _storage_formats[_current_decoded_frame] == 1; | 609 | 9 | std::vector<T> delta_values(current_frame_size); | 610 | 9 | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data()); | 611 | 9 | if (is_ascending) { | 612 | 9 | T pre_value = min; | 613 | 780 | for (uint8_t i = 0; i < current_frame_size; i++) { | 614 | 771 | T value = delta_values[i] + 
pre_value; | 615 | 771 | output[i] = value; | 616 | 771 | pre_value = value; | 617 | 771 | } | 618 | 9 | } else { | 619 | 0 | for (uint8_t i = 0; i < current_frame_size; i++) { | 620 | 0 | output[i] = delta_values[i] + min; | 621 | 0 | } | 622 | 0 | } | 623 | 9 | } | 624 | 9 | } |
_ZN5doris10ForDecoderIlE20decode_current_frameEPl Line | Count | Source | 580 | 2.08M | void ForDecoder<T>::decode_current_frame(T* output) { | 581 | 2.08M | uint32_t frame_index = _current_index / _max_frame_size; | 582 | 2.08M | if (frame_index == _current_decoded_frame) { | 583 | 2.06M | return; // current frame already decoded | 584 | 2.06M | } | 585 | 24.4k | _current_decoded_frame = frame_index; | 586 | 24.4k | uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index)); | 587 | | | 588 | 24.4k | uint32_t base_offset = _frame_offsets[_current_decoded_frame]; | 589 | 24.4k | T min = 0; | 590 | 24.4k | uint32_t delta_offset = 0; | 591 | | if constexpr (sizeof(T) == 16) { | 592 | | min = static_cast<T>(decode_fixed128_le(_buffer + base_offset)); | 593 | | delta_offset = base_offset + 16; | 594 | 24.4k | } else if constexpr (sizeof(T) == 8) { | 595 | 24.4k | min = static_cast<T>(decode_fixed64_le(_buffer + base_offset)); | 596 | 24.4k | delta_offset = base_offset + 8; | 597 | | } else { | 598 | | min = static_cast<T>(decode_fixed32_le(_buffer + base_offset)); | 599 | | delta_offset = base_offset + 4; | 600 | | } | 601 | | | 602 | 24.4k | uint8_t bit_width = _bit_widths[_current_decoded_frame]; | 603 | | | 604 | 24.4k | bool is_original_value = _storage_formats[_current_decoded_frame] == 2; | 605 | 24.4k | if (is_original_value) { | 606 | 0 | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output); | 607 | 24.4k | } else { | 608 | 24.4k | bool is_ascending = _storage_formats[_current_decoded_frame] == 1; | 609 | 24.4k | std::vector<T> delta_values(current_frame_size); | 610 | 24.4k | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data()); | 611 | 24.4k | if (is_ascending) { | 612 | 223 | T pre_value = min; | 613 | 1.89k | for (uint8_t i = 0; i < current_frame_size; i++) { | 614 | 1.67k | T value = delta_values[i] + pre_value; | 615 | 1.67k | output[i] = value; | 616 | 1.67k | pre_value = value; | 617 | 
1.67k | } | 618 | 24.2k | } else { | 619 | 2.11M | for (uint8_t i = 0; i < current_frame_size; i++) { | 620 | 2.08M | output[i] = delta_values[i] + min; | 621 | 2.08M | } | 622 | 24.2k | } | 623 | 24.4k | } | 624 | 24.4k | } |
_ZN5doris10ForDecoderInE20decode_current_frameEPn Line | Count | Source | 580 | 2.08M | void ForDecoder<T>::decode_current_frame(T* output) { | 581 | 2.08M | uint32_t frame_index = _current_index / _max_frame_size; | 582 | 2.08M | if (frame_index == _current_decoded_frame) { | 583 | 2.06M | return; // current frame already decoded | 584 | 2.06M | } | 585 | 24.4k | _current_decoded_frame = frame_index; | 586 | 24.4k | uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index)); | 587 | | | 588 | 24.4k | uint32_t base_offset = _frame_offsets[_current_decoded_frame]; | 589 | 24.4k | T min = 0; | 590 | 24.4k | uint32_t delta_offset = 0; | 591 | 24.4k | if constexpr (sizeof(T) == 16) { | 592 | 24.4k | min = static_cast<T>(decode_fixed128_le(_buffer + base_offset)); | 593 | 24.4k | delta_offset = base_offset + 16; | 594 | | } else if constexpr (sizeof(T) == 8) { | 595 | | min = static_cast<T>(decode_fixed64_le(_buffer + base_offset)); | 596 | | delta_offset = base_offset + 8; | 597 | | } else { | 598 | | min = static_cast<T>(decode_fixed32_le(_buffer + base_offset)); | 599 | | delta_offset = base_offset + 4; | 600 | | } | 601 | | | 602 | 24.4k | uint8_t bit_width = _bit_widths[_current_decoded_frame]; | 603 | | | 604 | 24.4k | bool is_original_value = _storage_formats[_current_decoded_frame] == 2; | 605 | 24.4k | if (is_original_value) { | 606 | 0 | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output); | 607 | 24.4k | } else { | 608 | 24.4k | bool is_ascending = _storage_formats[_current_decoded_frame] == 1; | 609 | 24.4k | std::vector<T> delta_values(current_frame_size); | 610 | 24.4k | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data()); | 611 | 24.4k | if (is_ascending) { | 612 | 214 | T pre_value = min; | 613 | 552 | for (uint8_t i = 0; i < current_frame_size; i++) { | 614 | 338 | T value = delta_values[i] + pre_value; | 615 | 338 | output[i] = value; | 616 | 338 | pre_value = value; | 617 | 338 | } 
| 618 | 24.2k | } else { | 619 | 2.11M | for (uint8_t i = 0; i < current_frame_size; i++) { | 620 | 2.08M | output[i] = delta_values[i] + min; | 621 | 2.08M | } | 622 | 24.2k | } | 623 | 24.4k | } | 624 | 24.4k | } |
Unexecuted instantiation: _ZN5doris10ForDecoderIhE20decode_current_frameEPh Unexecuted instantiation: _ZN5doris10ForDecoderItE20decode_current_frameEPt _ZN5doris10ForDecoderIjE20decode_current_frameEPj Line | Count | Source | 580 | 5 | void ForDecoder<T>::decode_current_frame(T* output) { | 581 | 5 | uint32_t frame_index = _current_index / _max_frame_size; | 582 | 5 | if (frame_index == _current_decoded_frame) { | 583 | 0 | return; // current frame already decoded | 584 | 0 | } | 585 | 5 | _current_decoded_frame = frame_index; | 586 | 5 | uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index)); | 587 | | | 588 | 5 | uint32_t base_offset = _frame_offsets[_current_decoded_frame]; | 589 | 5 | T min = 0; | 590 | 5 | uint32_t delta_offset = 0; | 591 | | if constexpr (sizeof(T) == 16) { | 592 | | min = static_cast<T>(decode_fixed128_le(_buffer + base_offset)); | 593 | | delta_offset = base_offset + 16; | 594 | | } else if constexpr (sizeof(T) == 8) { | 595 | | min = static_cast<T>(decode_fixed64_le(_buffer + base_offset)); | 596 | | delta_offset = base_offset + 8; | 597 | 5 | } else { | 598 | 5 | min = static_cast<T>(decode_fixed32_le(_buffer + base_offset)); | 599 | 5 | delta_offset = base_offset + 4; | 600 | 5 | } | 601 | | | 602 | 5 | uint8_t bit_width = _bit_widths[_current_decoded_frame]; | 603 | | | 604 | 5 | bool is_original_value = _storage_formats[_current_decoded_frame] == 2; | 605 | 5 | if (is_original_value) { | 606 | 0 | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output); | 607 | 5 | } else { | 608 | 5 | bool is_ascending = _storage_formats[_current_decoded_frame] == 1; | 609 | 5 | std::vector<T> delta_values(current_frame_size); | 610 | 5 | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data()); | 611 | 5 | if (is_ascending) { | 612 | 5 | T pre_value = min; | 613 | 645 | for (uint8_t i = 0; i < current_frame_size; i++) { | 614 | 640 | T value = delta_values[i] + pre_value; | 615 | 640 | 
output[i] = value; | 616 | 640 | pre_value = value; | 617 | 640 | } | 618 | 5 | } else { | 619 | 0 | for (uint8_t i = 0; i < current_frame_size; i++) { | 620 | 0 | output[i] = delta_values[i] + min; | 621 | 0 | } | 622 | 0 | } | 623 | 5 | } | 624 | 5 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderImE20decode_current_frameEPm Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE20decode_current_frameEPS1_ Unexecuted instantiation: _ZN5doris10ForDecoderIoE20decode_current_frameEPo |
625 | | |
626 | | template <typename T> |
627 | 12 | T ForDecoder<T>::decode_frame_min_value(uint32_t frame_index) { |
628 | 12 | uint32_t min_offset = _frame_offsets[frame_index]; |
629 | 12 | T min = 0; |
630 | 12 | if constexpr (sizeof(T) == 16) { |
631 | 0 | min = static_cast<T>(decode_fixed128_le(_buffer + min_offset)); |
632 | 12 | } else if constexpr (sizeof(T) == 8) { |
633 | 12 | min = static_cast<T>(decode_fixed64_le(_buffer + min_offset)); |
634 | 12 | } else { |
635 | 0 | min = static_cast<T>(decode_fixed32_le(_buffer + min_offset)); |
636 | 0 | } |
637 | 12 | return min; |
638 | 12 | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE22decode_frame_min_valueEj Unexecuted instantiation: _ZN5doris10ForDecoderIsE22decode_frame_min_valueEj Unexecuted instantiation: _ZN5doris10ForDecoderIiE22decode_frame_min_valueEj _ZN5doris10ForDecoderIlE22decode_frame_min_valueEj Line | Count | Source | 627 | 12 | T ForDecoder<T>::decode_frame_min_value(uint32_t frame_index) { | 628 | 12 | uint32_t min_offset = _frame_offsets[frame_index]; | 629 | 12 | T min = 0; | 630 | | if constexpr (sizeof(T) == 16) { | 631 | | min = static_cast<T>(decode_fixed128_le(_buffer + min_offset)); | 632 | 12 | } else if constexpr (sizeof(T) == 8) { | 633 | 12 | min = static_cast<T>(decode_fixed64_le(_buffer + min_offset)); | 634 | | } else { | 635 | | min = static_cast<T>(decode_fixed32_le(_buffer + min_offset)); | 636 | | } | 637 | 12 | return min; | 638 | 12 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderInE22decode_frame_min_valueEj Unexecuted instantiation: _ZN5doris10ForDecoderIhE22decode_frame_min_valueEj Unexecuted instantiation: _ZN5doris10ForDecoderItE22decode_frame_min_valueEj Unexecuted instantiation: _ZN5doris10ForDecoderIjE22decode_frame_min_valueEj Unexecuted instantiation: _ZN5doris10ForDecoderImE22decode_frame_min_valueEj Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE22decode_frame_min_valueEj Unexecuted instantiation: _ZN5doris10ForDecoderIoE22decode_frame_min_valueEj |
639 | | |
640 | | template <typename T> |
641 | 4.17M | T* ForDecoder<T>::copy_value(T* val, size_t count) { |
642 | 4.17M | memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count); |
643 | 4.17M | _current_index += count; |
644 | 4.17M | val += count; |
645 | 4.17M | return val; |
646 | 4.17M | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE10copy_valueEPam Unexecuted instantiation: _ZN5doris10ForDecoderIsE10copy_valueEPsm _ZN5doris10ForDecoderIiE10copy_valueEPim Line | Count | Source | 641 | 8 | T* ForDecoder<T>::copy_value(T* val, size_t count) { | 642 | 8 | memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count); | 643 | 8 | _current_index += count; | 644 | 8 | val += count; | 645 | 8 | return val; | 646 | 8 | } |
_ZN5doris10ForDecoderIlE10copy_valueEPlm Line | Count | Source | 641 | 2.08M | T* ForDecoder<T>::copy_value(T* val, size_t count) { | 642 | 2.08M | memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count); | 643 | 2.08M | _current_index += count; | 644 | 2.08M | val += count; | 645 | 2.08M | return val; | 646 | 2.08M | } |
_ZN5doris10ForDecoderInE10copy_valueEPnm Line | Count | Source | 641 | 2.08M | T* ForDecoder<T>::copy_value(T* val, size_t count) { | 642 | 2.08M | memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count); | 643 | 2.08M | _current_index += count; | 644 | 2.08M | val += count; | 645 | 2.08M | return val; | 646 | 2.08M | } |
Unexecuted instantiation: _ZN5doris10ForDecoderIhE10copy_valueEPhm Unexecuted instantiation: _ZN5doris10ForDecoderItE10copy_valueEPtm _ZN5doris10ForDecoderIjE10copy_valueEPjm Line | Count | Source | 641 | 3 | T* ForDecoder<T>::copy_value(T* val, size_t count) { | 642 | 3 | memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count); | 643 | 3 | _current_index += count; | 644 | 3 | val += count; | 645 | 3 | return val; | 646 | 3 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderImE10copy_valueEPmm Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE10copy_valueEPS1_m Unexecuted instantiation: _ZN5doris10ForDecoderIoE10copy_valueEPom |
647 | | |
648 | | template <typename T> |
649 | 4.17M | bool ForDecoder<T>::get_batch(T* val, size_t count) { |
650 | 4.17M | if (_current_index + count > _values_num) { |
651 | 1 | return false; |
652 | 1 | } |
653 | | |
654 | 4.17M | decode_current_frame(_out_buffer.data()); |
655 | | |
656 | 4.17M | if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) { |
657 | 4.16M | copy_value(val, count); |
658 | 4.16M | return true; |
659 | 4.16M | } |
660 | | |
661 | | // 1. padding one frame |
662 | 16.3k | size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index; |
663 | 16.3k | val = copy_value(val, padding_num); |
664 | | |
665 | | // 2. process frame by frame |
666 | 16.3k | size_t frame_count = (count - padding_num) / _max_frame_size; |
667 | 16.4k | for (size_t i = 0; i < frame_count; i++) { |
668 | | // directly decode value to the output, don't buffer the value |
669 | 7 | decode_current_frame(val); |
670 | 7 | _current_index += _max_frame_size; |
671 | 7 | val += _max_frame_size; |
672 | 7 | } |
673 | | |
674 | | // 3. process remaining value |
675 | 16.3k | size_t remaining_num = (count - padding_num) % _max_frame_size; |
676 | 16.3k | if (remaining_num > 0) { |
677 | 4 | decode_current_frame(_out_buffer.data()); |
678 | 4 | val = copy_value(val, remaining_num); |
679 | 4 | } |
680 | | |
681 | 16.3k | return true; |
682 | 4.17M | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE9get_batchEPam Unexecuted instantiation: _ZN5doris10ForDecoderIsE9get_batchEPsm _ZN5doris10ForDecoderIiE9get_batchEPim Line | Count | Source | 649 | 8 | bool ForDecoder<T>::get_batch(T* val, size_t count) { | 650 | 8 | if (_current_index + count > _values_num) { | 651 | 1 | return false; | 652 | 1 | } | 653 | | | 654 | 7 | decode_current_frame(_out_buffer.data()); | 655 | | | 656 | 7 | if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) { | 657 | 4 | copy_value(val, count); | 658 | 4 | return true; | 659 | 4 | } | 660 | | | 661 | | // 1. padding one frame | 662 | 3 | size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index; | 663 | 3 | val = copy_value(val, padding_num); | 664 | | | 665 | | // 2. process frame by frame | 666 | 3 | size_t frame_count = (count - padding_num) / _max_frame_size; | 667 | 5 | for (size_t i = 0; i < frame_count; i++) { | 668 | | // directly decode value to the output, don't buffer the value | 669 | 2 | decode_current_frame(val); | 670 | 2 | _current_index += _max_frame_size; | 671 | 2 | val += _max_frame_size; | 672 | 2 | } | 673 | | | 674 | | // 3. process remaining value | 675 | 3 | size_t remaining_num = (count - padding_num) % _max_frame_size; | 676 | 3 | if (remaining_num > 0) { | 677 | 1 | decode_current_frame(_out_buffer.data()); | 678 | 1 | val = copy_value(val, remaining_num); | 679 | 1 | } | 680 | | | 681 | 3 | return true; | 682 | 7 | } |
_ZN5doris10ForDecoderIlE9get_batchEPlm Line | Count | Source | 649 | 2.08M | bool ForDecoder<T>::get_batch(T* val, size_t count) { | 650 | 2.08M | if (_current_index + count > _values_num) { | 651 | 0 | return false; | 652 | 0 | } | 653 | | | 654 | 2.08M | decode_current_frame(_out_buffer.data()); | 655 | | | 656 | 2.08M | if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) { | 657 | 2.08M | copy_value(val, count); | 658 | 2.08M | return true; | 659 | 2.08M | } | 660 | | | 661 | | // 1. padding one frame | 662 | 8.19k | size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index; | 663 | 8.19k | val = copy_value(val, padding_num); | 664 | | | 665 | | // 2. process frame by frame | 666 | 8.19k | size_t frame_count = (count - padding_num) / _max_frame_size; | 667 | 8.19k | for (size_t i = 0; i < frame_count; i++) { | 668 | | // directly decode value to the output, don't buffer the value | 669 | 3 | decode_current_frame(val); | 670 | 3 | _current_index += _max_frame_size; | 671 | 3 | val += _max_frame_size; | 672 | 3 | } | 673 | | | 674 | | // 3. process remaining value | 675 | 8.19k | size_t remaining_num = (count - padding_num) % _max_frame_size; | 676 | 8.19k | if (remaining_num > 0) { | 677 | 3 | decode_current_frame(_out_buffer.data()); | 678 | 3 | val = copy_value(val, remaining_num); | 679 | 3 | } | 680 | | | 681 | 8.19k | return true; | 682 | 2.08M | } |
_ZN5doris10ForDecoderInE9get_batchEPnm Line | Count | Source | 649 | 2.08M | bool ForDecoder<T>::get_batch(T* val, size_t count) { | 650 | 2.08M | if (_current_index + count > _values_num) { | 651 | 0 | return false; | 652 | 0 | } | 653 | | | 654 | 2.08M | decode_current_frame(_out_buffer.data()); | 655 | | | 656 | 2.08M | if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) { | 657 | 2.08M | copy_value(val, count); | 658 | 2.08M | return true; | 659 | 2.08M | } | 660 | | | 661 | | // 1. padding one frame | 662 | 8.19k | size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index; | 663 | 8.19k | val = copy_value(val, padding_num); | 664 | | | 665 | | // 2. process frame by frame | 666 | 8.19k | size_t frame_count = (count - padding_num) / _max_frame_size; | 667 | 8.19k | for (size_t i = 0; i < frame_count; i++) { | 668 | | // directly decode value to the output, don't buffer the value | 669 | 0 | decode_current_frame(val); | 670 | 0 | _current_index += _max_frame_size; | 671 | 0 | val += _max_frame_size; | 672 | 0 | } | 673 | | | 674 | | // 3. process remaining value | 675 | 8.19k | size_t remaining_num = (count - padding_num) % _max_frame_size; | 676 | 8.19k | if (remaining_num > 0) { | 677 | 0 | decode_current_frame(_out_buffer.data()); | 678 | 0 | val = copy_value(val, remaining_num); | 679 | 0 | } | 680 | | | 681 | 8.19k | return true; | 682 | 2.08M | } |
Unexecuted instantiation: _ZN5doris10ForDecoderIhE9get_batchEPhm Unexecuted instantiation: _ZN5doris10ForDecoderItE9get_batchEPtm _ZN5doris10ForDecoderIjE9get_batchEPjm Line | Count | Source | 649 | 3 | bool ForDecoder<T>::get_batch(T* val, size_t count) { | 650 | 3 | if (_current_index + count > _values_num) { | 651 | 0 | return false; | 652 | 0 | } | 653 | | | 654 | 3 | decode_current_frame(_out_buffer.data()); | 655 | | | 656 | 3 | if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) { | 657 | 0 | copy_value(val, count); | 658 | 0 | return true; | 659 | 0 | } | 660 | | | 661 | | // 1. padding one frame | 662 | 3 | size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index; | 663 | 3 | val = copy_value(val, padding_num); | 664 | | | 665 | | // 2. process frame by frame | 666 | 3 | size_t frame_count = (count - padding_num) / _max_frame_size; | 667 | 5 | for (size_t i = 0; i < frame_count; i++) { | 668 | | // directly decode value to the output, don't buffer the value | 669 | 2 | decode_current_frame(val); | 670 | 2 | _current_index += _max_frame_size; | 671 | 2 | val += _max_frame_size; | 672 | 2 | } | 673 | | | 674 | | // 3. process remaining value | 675 | 3 | size_t remaining_num = (count - padding_num) % _max_frame_size; | 676 | 3 | if (remaining_num > 0) { | 677 | 0 | decode_current_frame(_out_buffer.data()); | 678 | 0 | val = copy_value(val, remaining_num); | 679 | 0 | } | 680 | | | 681 | 3 | return true; | 682 | 3 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderImE9get_batchEPmm Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE9get_batchEPS1_m Unexecuted instantiation: _ZN5doris10ForDecoderIoE9get_batchEPom |
683 | | |
684 | | template <typename T> |
685 | 3 | bool ForDecoder<T>::skip(int32_t skip_num) { |
686 | 3 | if (_current_index + skip_num >= _values_num) { |
687 | 0 | return false; |
688 | 0 | } |
689 | 3 | _current_index = _current_index + skip_num; |
690 | 3 | return true; |
691 | 3 | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE4skipEi Unexecuted instantiation: _ZN5doris10ForDecoderIsE4skipEi Unexecuted instantiation: _ZN5doris10ForDecoderIiE4skipEi Unexecuted instantiation: _ZN5doris10ForDecoderIlE4skipEi Unexecuted instantiation: _ZN5doris10ForDecoderInE4skipEi Unexecuted instantiation: _ZN5doris10ForDecoderIhE4skipEi Unexecuted instantiation: _ZN5doris10ForDecoderItE4skipEi _ZN5doris10ForDecoderIjE4skipEi Line | Count | Source | 685 | 3 | bool ForDecoder<T>::skip(int32_t skip_num) { | 686 | 3 | if (_current_index + skip_num >= _values_num) { | 687 | 0 | return false; | 688 | 0 | } | 689 | 3 | _current_index = _current_index + skip_num; | 690 | 3 | return true; | 691 | 3 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderImE4skipEi Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE4skipEi Unexecuted instantiation: _ZN5doris10ForDecoderIoE4skipEi |
692 | | |
693 | | template <typename T> |
694 | 6 | uint32_t ForDecoder<T>::seek_last_frame_before_value(T target) { |
695 | | // first of all, find the first frame >= target |
696 | 6 | uint32_t left = 0; |
697 | 6 | uint32_t right = _frame_count; |
698 | 18 | while (left < right) { |
699 | 12 | uint32_t mid = left + (right - left) / 2; |
700 | 12 | T midValue = decode_frame_min_value(mid); |
701 | 12 | if (midValue < target) { |
702 | 6 | left = mid + 1; |
703 | 6 | } else { |
704 | 6 | right = mid; |
705 | 6 | } |
706 | 12 | } |
707 | | // after loop, left is the first frame >= target |
708 | 6 | if (left == 0) { |
709 | | // all frames are >= target, not found |
710 | 2 | return _frame_count; |
711 | 2 | } |
712 | | // otherwise previous frame is the last frame < target |
713 | 4 | return left - 1; |
714 | 6 | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE28seek_last_frame_before_valueEa Unexecuted instantiation: _ZN5doris10ForDecoderIsE28seek_last_frame_before_valueEs Unexecuted instantiation: _ZN5doris10ForDecoderIiE28seek_last_frame_before_valueEi _ZN5doris10ForDecoderIlE28seek_last_frame_before_valueEl Line | Count | Source | 694 | 6 | uint32_t ForDecoder<T>::seek_last_frame_before_value(T target) { | 695 | | // first of all, find the first frame >= target | 696 | 6 | uint32_t left = 0; | 697 | 6 | uint32_t right = _frame_count; | 698 | 18 | while (left < right) { | 699 | 12 | uint32_t mid = left + (right - left) / 2; | 700 | 12 | T midValue = decode_frame_min_value(mid); | 701 | 12 | if (midValue < target) { | 702 | 6 | left = mid + 1; | 703 | 6 | } else { | 704 | 6 | right = mid; | 705 | 6 | } | 706 | 12 | } | 707 | | // after loop, left is the first frame >= target | 708 | 6 | if (left == 0) { | 709 | | // all frames are >= target, not found | 710 | 2 | return _frame_count; | 711 | 2 | } | 712 | | // otherwise previous frame is the last frame < target | 713 | 4 | return left - 1; | 714 | 6 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderInE28seek_last_frame_before_valueEn Unexecuted instantiation: _ZN5doris10ForDecoderIhE28seek_last_frame_before_valueEh Unexecuted instantiation: _ZN5doris10ForDecoderItE28seek_last_frame_before_valueEt Unexecuted instantiation: _ZN5doris10ForDecoderIjE28seek_last_frame_before_valueEj Unexecuted instantiation: _ZN5doris10ForDecoderImE28seek_last_frame_before_valueEm Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE28seek_last_frame_before_valueES1_ Unexecuted instantiation: _ZN5doris10ForDecoderIoE28seek_last_frame_before_valueEo |
715 | | |
716 | | template <typename T> |
717 | | bool ForDecoder<T>::seek_lower_bound_inside_frame(uint32_t frame_index, T target, |
718 | 4 | bool* exact_match) { |
719 | 4 | _current_index = frame_index * _max_frame_size; |
720 | 4 | decode_current_frame(_out_buffer.data()); |
721 | 4 | auto end = _out_buffer.begin() + frame_size(frame_index); |
722 | 4 | auto pos = std::lower_bound(_out_buffer.begin(), end, target); |
723 | 4 | if (pos != end) { // found in this frame |
724 | 2 | auto pos_in_frame = cast_set<uint32_t>(std::distance(_out_buffer.begin(), pos)); |
725 | 2 | *exact_match = _out_buffer[pos_in_frame] == target; |
726 | 2 | _current_index += pos_in_frame; |
727 | 2 | return true; |
728 | 2 | } |
729 | 2 | return false; |
730 | 4 | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE29seek_lower_bound_inside_frameEjaPb Unexecuted instantiation: _ZN5doris10ForDecoderIsE29seek_lower_bound_inside_frameEjsPb Unexecuted instantiation: _ZN5doris10ForDecoderIiE29seek_lower_bound_inside_frameEjiPb _ZN5doris10ForDecoderIlE29seek_lower_bound_inside_frameEjlPb Line | Count | Source | 718 | 4 | bool* exact_match) { | 719 | 4 | _current_index = frame_index * _max_frame_size; | 720 | 4 | decode_current_frame(_out_buffer.data()); | 721 | 4 | auto end = _out_buffer.begin() + frame_size(frame_index); | 722 | 4 | auto pos = std::lower_bound(_out_buffer.begin(), end, target); | 723 | 4 | if (pos != end) { // found in this frame | 724 | 2 | auto pos_in_frame = cast_set<uint32_t>(std::distance(_out_buffer.begin(), pos)); | 725 | 2 | *exact_match = _out_buffer[pos_in_frame] == target; | 726 | 2 | _current_index += pos_in_frame; | 727 | 2 | return true; | 728 | 2 | } | 729 | 2 | return false; | 730 | 4 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderInE29seek_lower_bound_inside_frameEjnPb Unexecuted instantiation: _ZN5doris10ForDecoderIhE29seek_lower_bound_inside_frameEjhPb Unexecuted instantiation: _ZN5doris10ForDecoderItE29seek_lower_bound_inside_frameEjtPb Unexecuted instantiation: _ZN5doris10ForDecoderIjE29seek_lower_bound_inside_frameEjjPb Unexecuted instantiation: _ZN5doris10ForDecoderImE29seek_lower_bound_inside_frameEjmPb Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE29seek_lower_bound_inside_frameEjS1_Pb Unexecuted instantiation: _ZN5doris10ForDecoderIoE29seek_lower_bound_inside_frameEjoPb |
731 | | |
732 | | template <typename T> |
733 | 6 | bool ForDecoder<T>::seek_at_or_after_value(const void* value, bool* exact_match) { |
734 | 6 | T target = *reinterpret_cast<const T*>(value); |
735 | 6 | uint32_t frame_to_search = seek_last_frame_before_value(target); |
736 | 6 | if (frame_to_search == _frame_count) { |
737 | | // all frames are >= target, the searched value must the be first value |
738 | 2 | _current_index = 0; |
739 | 2 | decode_current_frame(_out_buffer.data()); |
740 | 2 | *exact_match = _out_buffer[0] == target; |
741 | 2 | return true; |
742 | 2 | } |
743 | | // binary search inside the last frame < target |
744 | 4 | bool found = seek_lower_bound_inside_frame(frame_to_search, target, exact_match); |
745 | | // if not found, all values in the last frame are less than target. |
746 | | // then the searched value must be the first value of the next frame. |
747 | 4 | if (!found && frame_to_search < _frame_count - 1) { |
748 | 1 | _current_index = (frame_to_search + 1) * _max_frame_size; |
749 | 1 | decode_current_frame(_out_buffer.data()); |
750 | 1 | *exact_match = _out_buffer[0] == target; |
751 | 1 | return true; |
752 | 1 | } |
753 | 3 | return found; |
754 | 4 | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE22seek_at_or_after_valueEPKvPb Unexecuted instantiation: _ZN5doris10ForDecoderIsE22seek_at_or_after_valueEPKvPb Unexecuted instantiation: _ZN5doris10ForDecoderIiE22seek_at_or_after_valueEPKvPb _ZN5doris10ForDecoderIlE22seek_at_or_after_valueEPKvPb Line | Count | Source | 733 | 6 | bool ForDecoder<T>::seek_at_or_after_value(const void* value, bool* exact_match) { | 734 | 6 | T target = *reinterpret_cast<const T*>(value); | 735 | 6 | uint32_t frame_to_search = seek_last_frame_before_value(target); | 736 | 6 | if (frame_to_search == _frame_count) { | 737 | | // all frames are >= target, the searched value must the be first value | 738 | 2 | _current_index = 0; | 739 | 2 | decode_current_frame(_out_buffer.data()); | 740 | 2 | *exact_match = _out_buffer[0] == target; | 741 | 2 | return true; | 742 | 2 | } | 743 | | // binary search inside the last frame < target | 744 | 4 | bool found = seek_lower_bound_inside_frame(frame_to_search, target, exact_match); | 745 | | // if not found, all values in the last frame are less than target. | 746 | | // then the searched value must be the first value of the next frame. | 747 | 4 | if (!found && frame_to_search < _frame_count - 1) { | 748 | 1 | _current_index = (frame_to_search + 1) * _max_frame_size; | 749 | 1 | decode_current_frame(_out_buffer.data()); | 750 | 1 | *exact_match = _out_buffer[0] == target; | 751 | 1 | return true; | 752 | 1 | } | 753 | 3 | return found; | 754 | 4 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderInE22seek_at_or_after_valueEPKvPb Unexecuted instantiation: _ZN5doris10ForDecoderIhE22seek_at_or_after_valueEPKvPb Unexecuted instantiation: _ZN5doris10ForDecoderItE22seek_at_or_after_valueEPKvPb Unexecuted instantiation: _ZN5doris10ForDecoderIjE22seek_at_or_after_valueEPKvPb Unexecuted instantiation: _ZN5doris10ForDecoderImE22seek_at_or_after_valueEPKvPb Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE22seek_at_or_after_valueEPKvPb Unexecuted instantiation: _ZN5doris10ForDecoderIoE22seek_at_or_after_valueEPKvPb |
755 | | |
756 | | template class ForEncoder<int8_t>; |
757 | | template class ForEncoder<int16_t>; |
758 | | template class ForEncoder<int32_t>; |
759 | | template class ForEncoder<int64_t>; |
760 | | template class ForEncoder<int128_t>; |
761 | | template class ForEncoder<uint8_t>; |
762 | | template class ForEncoder<uint16_t>; |
763 | | template class ForEncoder<uint32_t>; |
764 | | template class ForEncoder<uint64_t>; |
765 | | template class ForEncoder<uint24_t>; |
766 | | template class ForEncoder<uint128_t>; |
767 | | |
768 | | template class ForDecoder<int8_t>; |
769 | | template class ForDecoder<int16_t>; |
770 | | template class ForDecoder<int32_t>; |
771 | | template class ForDecoder<int64_t>; |
772 | | template class ForDecoder<int128_t>; |
773 | | template class ForDecoder<uint8_t>; |
774 | | template class ForDecoder<uint16_t>; |
775 | | template class ForDecoder<uint32_t>; |
776 | | template class ForDecoder<uint64_t>; |
777 | | template class ForDecoder<uint24_t>; |
778 | | template class ForDecoder<uint128_t>; |
779 | | } // namespace doris |