be/src/util/frame_of_reference_coding.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "util/frame_of_reference_coding.h" |
19 | | |
20 | | #include <glog/logging.h> |
21 | | #include <sys/types.h> |
22 | | |
23 | | #include <algorithm> |
24 | | #include <cstring> |
25 | | #include <iostream> |
26 | | #include <iterator> |
27 | | #include <limits> |
28 | | |
29 | | #include "common/cast_set.h" |
30 | | #include "exec/common/endian.h" |
31 | | #include "util/bit_util.h" |
32 | | #include "util/coding.h" |
33 | | |
34 | | namespace doris { |
35 | | #include "common/compile_check_begin.h" |
36 | | |
37 | | template <typename T> |
38 | 8.35M | const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) { |
39 | 8.35M | memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T)); |
40 | 8.35M | _buffered_values_num += count; |
41 | 8.35M | p_data += count; |
42 | 8.35M | return p_data; |
43 | 8.35M | } Unexecuted instantiation: _ZN5doris10ForEncoderIaE10copy_valueEPKam Unexecuted instantiation: _ZN5doris10ForEncoderIsE10copy_valueEPKsm _ZN5doris10ForEncoderIiE10copy_valueEPKim Line | Count | Source | 38 | 16 | const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) { | 39 | 16 | memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T)); | 40 | 16 | _buffered_values_num += count; | 41 | 16 | p_data += count; | 42 | 16 | return p_data; | 43 | 16 | } |
_ZN5doris10ForEncoderIlE10copy_valueEPKlm Line | Count | Source | 38 | 4.17M | const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) { | 39 | 4.17M | memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T)); | 40 | 4.17M | _buffered_values_num += count; | 41 | 4.17M | p_data += count; | 42 | 4.17M | return p_data; | 43 | 4.17M | } |
_ZN5doris10ForEncoderInE10copy_valueEPKnm Line | Count | Source | 38 | 4.17M | const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) { | 39 | 4.17M | memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T)); | 40 | 4.17M | _buffered_values_num += count; | 41 | 4.17M | p_data += count; | 42 | 4.17M | return p_data; | 43 | 4.17M | } |
Unexecuted instantiation: _ZN5doris10ForEncoderIhE10copy_valueEPKhm Unexecuted instantiation: _ZN5doris10ForEncoderItE10copy_valueEPKtm _ZN5doris10ForEncoderIjE10copy_valueEPKjm Line | Count | Source | 38 | 6 | const T* ForEncoder<T>::copy_value(const T* p_data, size_t count) { | 39 | 6 | memcpy(&_buffered_values[_buffered_values_num], p_data, count * sizeof(T)); | 40 | 6 | _buffered_values_num += count; | 41 | 6 | p_data += count; | 42 | 6 | return p_data; | 43 | 6 | } |
Unexecuted instantiation: _ZN5doris10ForEncoderImE10copy_valueEPKmm Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10copy_valueEPKS1_m Unexecuted instantiation: _ZN5doris10ForEncoderIoE10copy_valueEPKom |
44 | | |
45 | | template <typename T> |
46 | 8.35M | void ForEncoder<T>::put_batch(const T* in_data, size_t count) { |
47 | 8.35M | if (_buffered_values_num + count < FRAME_VALUE_NUM) { |
48 | 8.32M | copy_value(in_data, count); |
49 | 8.32M | _values_num += count; |
50 | 8.32M | return; |
51 | 8.32M | } |
52 | | |
53 | | // 1. padding one frame |
54 | 32.7k | size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num; |
55 | 32.7k | in_data = copy_value(in_data, padding_num); |
56 | 32.7k | bit_packing_one_frame_value(_buffered_values); |
57 | | |
58 | | // 2. process frame by frame |
59 | 32.7k | size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM; |
60 | 32.8k | for (size_t i = 0; i < frame_size; i++) { |
61 | | // directly encode value to the bit_writer, don't buffer the value |
62 | 16 | _buffered_values_num = FRAME_VALUE_NUM; |
63 | 16 | bit_packing_one_frame_value(in_data); |
64 | 16 | in_data += FRAME_VALUE_NUM; |
65 | 16 | } |
66 | | |
67 | | // 3. process remaining value |
68 | 32.7k | size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM; |
69 | 32.7k | if (remaining_num > 0) { |
70 | 8 | copy_value(in_data, remaining_num); |
71 | 8 | } |
72 | | |
73 | 32.7k | _values_num += count; |
74 | 32.7k | } Unexecuted instantiation: _ZN5doris10ForEncoderIaE9put_batchEPKam Unexecuted instantiation: _ZN5doris10ForEncoderIsE9put_batchEPKsm _ZN5doris10ForEncoderIiE9put_batchEPKim Line | Count | Source | 46 | 14 | void ForEncoder<T>::put_batch(const T* in_data, size_t count) { | 47 | 14 | if (_buffered_values_num + count < FRAME_VALUE_NUM) { | 48 | 8 | copy_value(in_data, count); | 49 | 8 | _values_num += count; | 50 | 8 | return; | 51 | 8 | } | 52 | | | 53 | | // 1. padding one frame | 54 | 6 | size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num; | 55 | 6 | in_data = copy_value(in_data, padding_num); | 56 | 6 | bit_packing_one_frame_value(_buffered_values); | 57 | | | 58 | | // 2. process frame by frame | 59 | 6 | size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM; | 60 | 10 | for (size_t i = 0; i < frame_size; i++) { | 61 | | // directly encode value to the bit_writer, don't buffer the value | 62 | 4 | _buffered_values_num = FRAME_VALUE_NUM; | 63 | 4 | bit_packing_one_frame_value(in_data); | 64 | 4 | in_data += FRAME_VALUE_NUM; | 65 | 4 | } | 66 | | | 67 | | // 3. process remaining value | 68 | 6 | size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM; | 69 | 6 | if (remaining_num > 0) { | 70 | 2 | copy_value(in_data, remaining_num); | 71 | 2 | } | 72 | | | 73 | 6 | _values_num += count; | 74 | 6 | } |
_ZN5doris10ForEncoderIlE9put_batchEPKlm Line | Count | Source | 46 | 4.17M | void ForEncoder<T>::put_batch(const T* in_data, size_t count) { | 47 | 4.17M | if (_buffered_values_num + count < FRAME_VALUE_NUM) { | 48 | 4.16M | copy_value(in_data, count); | 49 | 4.16M | _values_num += count; | 50 | 4.16M | return; | 51 | 4.16M | } | 52 | | | 53 | | // 1. padding one frame | 54 | 16.3k | size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num; | 55 | 16.3k | in_data = copy_value(in_data, padding_num); | 56 | 16.3k | bit_packing_one_frame_value(_buffered_values); | 57 | | | 58 | | // 2. process frame by frame | 59 | 16.3k | size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM; | 60 | 16.3k | for (size_t i = 0; i < frame_size; i++) { | 61 | | // directly encode value to the bit_writer, don't buffer the value | 62 | 6 | _buffered_values_num = FRAME_VALUE_NUM; | 63 | 6 | bit_packing_one_frame_value(in_data); | 64 | 6 | in_data += FRAME_VALUE_NUM; | 65 | 6 | } | 66 | | | 67 | | // 3. process remaining value | 68 | 16.3k | size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM; | 69 | 16.3k | if (remaining_num > 0) { | 70 | 6 | copy_value(in_data, remaining_num); | 71 | 6 | } | 72 | | | 73 | 16.3k | _values_num += count; | 74 | 16.3k | } |
_ZN5doris10ForEncoderInE9put_batchEPKnm Line | Count | Source | 46 | 4.17M | void ForEncoder<T>::put_batch(const T* in_data, size_t count) { | 47 | 4.17M | if (_buffered_values_num + count < FRAME_VALUE_NUM) { | 48 | 4.16M | copy_value(in_data, count); | 49 | 4.16M | _values_num += count; | 50 | 4.16M | return; | 51 | 4.16M | } | 52 | | | 53 | | // 1. padding one frame | 54 | 16.3k | size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num; | 55 | 16.3k | in_data = copy_value(in_data, padding_num); | 56 | 16.3k | bit_packing_one_frame_value(_buffered_values); | 57 | | | 58 | | // 2. process frame by frame | 59 | 16.3k | size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM; | 60 | 16.3k | for (size_t i = 0; i < frame_size; i++) { | 61 | | // directly encode value to the bit_writer, don't buffer the value | 62 | 0 | _buffered_values_num = FRAME_VALUE_NUM; | 63 | 0 | bit_packing_one_frame_value(in_data); | 64 | 0 | in_data += FRAME_VALUE_NUM; | 65 | 0 | } | 66 | | | 67 | | // 3. process remaining value | 68 | 16.3k | size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM; | 69 | 16.3k | if (remaining_num > 0) { | 70 | 0 | copy_value(in_data, remaining_num); | 71 | 0 | } | 72 | | | 73 | 16.3k | _values_num += count; | 74 | 16.3k | } |
Unexecuted instantiation: _ZN5doris10ForEncoderIhE9put_batchEPKhm Unexecuted instantiation: _ZN5doris10ForEncoderItE9put_batchEPKtm _ZN5doris10ForEncoderIjE9put_batchEPKjm Line | Count | Source | 46 | 6 | void ForEncoder<T>::put_batch(const T* in_data, size_t count) { | 47 | 6 | if (_buffered_values_num + count < FRAME_VALUE_NUM) { | 48 | 0 | copy_value(in_data, count); | 49 | 0 | _values_num += count; | 50 | 0 | return; | 51 | 0 | } | 52 | | | 53 | | // 1. padding one frame | 54 | 6 | size_t padding_num = FRAME_VALUE_NUM - _buffered_values_num; | 55 | 6 | in_data = copy_value(in_data, padding_num); | 56 | 6 | bit_packing_one_frame_value(_buffered_values); | 57 | | | 58 | | // 2. process frame by frame | 59 | 6 | size_t frame_size = (count - padding_num) / FRAME_VALUE_NUM; | 60 | 12 | for (size_t i = 0; i < frame_size; i++) { | 61 | | // directly encode value to the bit_writer, don't buffer the value | 62 | 6 | _buffered_values_num = FRAME_VALUE_NUM; | 63 | 6 | bit_packing_one_frame_value(in_data); | 64 | 6 | in_data += FRAME_VALUE_NUM; | 65 | 6 | } | 66 | | | 67 | | // 3. process remaining value | 68 | 6 | size_t remaining_num = (count - padding_num) % FRAME_VALUE_NUM; | 69 | 6 | if (remaining_num > 0) { | 70 | 0 | copy_value(in_data, remaining_num); | 71 | 0 | } | 72 | | | 73 | 6 | _values_num += count; | 74 | 6 | } |
Unexecuted instantiation: _ZN5doris10ForEncoderImE9put_batchEPKmm Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE9put_batchEPKS1_m Unexecuted instantiation: _ZN5doris10ForEncoderIoE9put_batchEPKom |
75 | | |
76 | | // todo(kks): improve this method by SIMD instructions |
77 | | |
78 | | template <typename T> |
79 | 30.6k | void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { |
80 | 30.6k | int64_t s = 0; |
81 | 30.6k | uint8_t output_mask = 255; |
82 | 30.6k | int tail_count = in_num & 7; // the remainder of in_num modulo 8 |
83 | 30.6k | int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8 |
84 | | |
85 | 475k | for (int i = 0; i < full_batch_size; i += 8) { |
86 | | // Put the 8 numbers in the input into s in order, each number occupies bit_width bit |
87 | 445k | s |= static_cast<int64_t>(input[i + 7]); |
88 | 445k | s |= (static_cast<int64_t>(input[i + 6])) << bit_width; |
89 | 445k | s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width); |
90 | 445k | s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width); |
91 | 445k | s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width); |
92 | 445k | s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width); |
93 | 445k | s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width); |
94 | 445k | s |= (static_cast<int64_t>(input[i])) << (7 * bit_width); |
95 | | |
96 | | // Starting with the highest valid bit, take out 8 bits in sequence |
97 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid |
98 | | // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end |
99 | 2.44M | for (int j = 0; j < bit_width; j++) { |
100 | 2.00M | output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask; |
101 | 2.00M | } |
102 | 445k | output += bit_width; |
103 | 445k | s = 0; |
104 | 445k | } |
105 | | |
106 | | // remainder |
107 | 30.6k | int byte = tail_count * bit_width; // How many bits are left to store |
108 | 30.6k | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input |
109 | | |
110 | | // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit |
111 | 130k | for (int i = 0; i < tail_count; i++) { |
112 | 100k | s |= (static_cast<int64_t>(input[i + full_batch_size])) |
113 | 100k | << ((tail_count - i - 1) * bit_width); |
114 | 100k | } |
115 | | |
116 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end |
117 | 30.6k | s <<= (bytes << 3) - byte; |
118 | | |
119 | | // Starting with the highest valid bit, take out 8 bits in sequence |
120 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid. |
121 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end |
122 | 96.8k | for (int i = 0; i < bytes; i++) { |
123 | 66.2k | output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask; |
124 | 66.2k | } |
125 | 30.6k | } Unexecuted instantiation: _ZN5doris10ForEncoderIaE10bit_pack_8EPKahiPh Unexecuted instantiation: _ZN5doris10ForEncoderIsE10bit_pack_8EPKshiPh _ZN5doris10ForEncoderIiE10bit_pack_8EPKihiPh Line | Count | Source | 79 | 16 | void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 80 | 16 | int64_t s = 0; | 81 | 16 | uint8_t output_mask = 255; | 82 | 16 | int tail_count = in_num & 7; // the remainder of in_num modulo 8 | 83 | 16 | int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8 | 84 | | | 85 | 208 | for (int i = 0; i < full_batch_size; i += 8) { | 86 | | // Put the 8 numbers in the input into s in order, each number occupies bit_width bit | 87 | 192 | s |= static_cast<int64_t>(input[i + 7]); | 88 | 192 | s |= (static_cast<int64_t>(input[i + 6])) << bit_width; | 89 | 192 | s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width); | 90 | 192 | s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width); | 91 | 192 | s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width); | 92 | 192 | s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width); | 93 | 192 | s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width); | 94 | 192 | s |= (static_cast<int64_t>(input[i])) << (7 * bit_width); | 95 | | | 96 | | // Starting with the highest valid bit, take out 8 bits in sequence | 97 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 98 | | // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end | 99 | 384 | for (int j = 0; j < bit_width; j++) { | 100 | 192 | output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask; | 101 | 192 | } | 102 | 192 | output += bit_width; | 103 | 192 | s = 0; | 104 | 192 | } | 105 | | | 106 | | // remainder | 107 | 16 | int byte = tail_count * bit_width; // How many bits are left to store | 108 | 16 | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input | 109 | | | 110 | | // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit | 111 | 20 | for (int i = 0; i < tail_count; i++) { | 112 | 4 | s |= (static_cast<int64_t>(input[i + full_batch_size])) | 113 | 4 | << ((tail_count - i - 1) * bit_width); | 114 | 4 | } | 115 | | | 116 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end | 117 | 16 | s <<= (bytes << 3) - byte; | 118 | | | 119 | | // Starting with the highest valid bit, take out 8 bits in sequence | 120 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid. | 121 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end | 122 | 18 | for (int i = 0; i < bytes; i++) { | 123 | 2 | output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask; | 124 | 2 | } | 125 | 16 | } |
_ZN5doris10ForEncoderIlE10bit_pack_8EPKlhiPh Line | Count | Source | 79 | 6.10k | void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 80 | 6.10k | int64_t s = 0; | 81 | 6.10k | uint8_t output_mask = 255; | 82 | 6.10k | int tail_count = in_num & 7; // the remainder of in_num modulo 8 | 83 | 6.10k | int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8 | 84 | | | 85 | 69.8k | for (int i = 0; i < full_batch_size; i += 8) { | 86 | | // Put the 8 numbers in the input into s in order, each number occupies bit_width bit | 87 | 63.7k | s |= static_cast<int64_t>(input[i + 7]); | 88 | 63.7k | s |= (static_cast<int64_t>(input[i + 6])) << bit_width; | 89 | 63.7k | s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width); | 90 | 63.7k | s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width); | 91 | 63.7k | s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width); | 92 | 63.7k | s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width); | 93 | 63.7k | s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width); | 94 | 63.7k | s |= (static_cast<int64_t>(input[i])) << (7 * bit_width); | 95 | | | 96 | | // Starting with the highest valid bit, take out 8 bits in sequence | 97 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 98 | | // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end | 99 | 349k | for (int j = 0; j < bit_width; j++) { | 100 | 285k | output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask; | 101 | 285k | } | 102 | 63.7k | output += bit_width; | 103 | 63.7k | s = 0; | 104 | 63.7k | } | 105 | | | 106 | | // remainder | 107 | 6.10k | int byte = tail_count * bit_width; // How many bits are left to store | 108 | 6.10k | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input | 109 | | | 110 | | // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit | 111 | 20.4k | for (int i = 0; i < tail_count; i++) { | 112 | 14.3k | s |= (static_cast<int64_t>(input[i + full_batch_size])) | 113 | 14.3k | << ((tail_count - i - 1) * bit_width); | 114 | 14.3k | } | 115 | | | 116 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end | 117 | 6.10k | s <<= (bytes << 3) - byte; | 118 | | | 119 | | // Starting with the highest valid bit, take out 8 bits in sequence | 120 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid. | 121 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end | 122 | 15.5k | for (int i = 0; i < bytes; i++) { | 123 | 9.44k | output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask; | 124 | 9.44k | } | 125 | 6.10k | } |
_ZN5doris10ForEncoderInE10bit_pack_8EPKnhiPh Line | Count | Source | 79 | 24.4k | void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 80 | 24.4k | int64_t s = 0; | 81 | 24.4k | uint8_t output_mask = 255; | 82 | 24.4k | int tail_count = in_num & 7; // the remainder of in_num modulo 8 | 83 | 24.4k | int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8 | 84 | | | 85 | 405k | for (int i = 0; i < full_batch_size; i += 8) { | 86 | | // Put the 8 numbers in the input into s in order, each number occupies bit_width bit | 87 | 380k | s |= static_cast<int64_t>(input[i + 7]); | 88 | 380k | s |= (static_cast<int64_t>(input[i + 6])) << bit_width; | 89 | 380k | s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width); | 90 | 380k | s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width); | 91 | 380k | s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width); | 92 | 380k | s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width); | 93 | 380k | s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width); | 94 | 380k | s |= (static_cast<int64_t>(input[i])) << (7 * bit_width); | 95 | | | 96 | | // Starting with the highest valid bit, take out 8 bits in sequence | 97 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 98 | | // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end | 99 | 2.09M | for (int j = 0; j < bit_width; j++) { | 100 | 1.71M | output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask; | 101 | 1.71M | } | 102 | 380k | output += bit_width; | 103 | 380k | s = 0; | 104 | 380k | } | 105 | | | 106 | | // remainder | 107 | 24.4k | int byte = tail_count * bit_width; // How many bits are left to store | 108 | 24.4k | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input | 109 | | | 110 | | // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit | 111 | 110k | for (int i = 0; i < tail_count; i++) { | 112 | 86.0k | s |= (static_cast<int64_t>(input[i + full_batch_size])) | 113 | 86.0k | << ((tail_count - i - 1) * bit_width); | 114 | 86.0k | } | 115 | | | 116 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end | 117 | 24.4k | s <<= (bytes << 3) - byte; | 118 | | | 119 | | // Starting with the highest valid bit, take out 8 bits in sequence | 120 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid. | 121 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end | 122 | 81.3k | for (int i = 0; i < bytes; i++) { | 123 | 56.8k | output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask; | 124 | 56.8k | } | 125 | 24.4k | } |
Unexecuted instantiation: _ZN5doris10ForEncoderIhE10bit_pack_8EPKhhiPh Unexecuted instantiation: _ZN5doris10ForEncoderItE10bit_pack_8EPKthiPh _ZN5doris10ForEncoderIjE10bit_pack_8EPKjhiPh Line | Count | Source | 79 | 12 | void ForEncoder<T>::bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 80 | 12 | int64_t s = 0; | 81 | 12 | uint8_t output_mask = 255; | 82 | 12 | int tail_count = in_num & 7; // the remainder of in_num modulo 8 | 83 | 12 | int full_batch_size = (in_num >> 3) << 3; // Adjust in_num to a multiple of 8 | 84 | | | 85 | 204 | for (int i = 0; i < full_batch_size; i += 8) { | 86 | | // Put the 8 numbers in the input into s in order, each number occupies bit_width bit | 87 | 192 | s |= static_cast<int64_t>(input[i + 7]); | 88 | 192 | s |= (static_cast<int64_t>(input[i + 6])) << bit_width; | 89 | 192 | s |= (static_cast<int64_t>(input[i + 5])) << (2 * bit_width); | 90 | 192 | s |= (static_cast<int64_t>(input[i + 4])) << (3 * bit_width); | 91 | 192 | s |= (static_cast<int64_t>(input[i + 3])) << (4 * bit_width); | 92 | 192 | s |= (static_cast<int64_t>(input[i + 2])) << (5 * bit_width); | 93 | 192 | s |= (static_cast<int64_t>(input[i + 1])) << (6 * bit_width); | 94 | 192 | s |= (static_cast<int64_t>(input[i])) << (7 * bit_width); | 95 | | | 96 | | // Starting with the highest valid bit, take out 8 bits in sequence | 97 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 98 | | // (bit_width - j - 1) << 3 used to calculate how many bits need to be removed at the end | 99 | 384 | for (int j = 0; j < bit_width; j++) { | 100 | 192 | output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask; | 101 | 192 | } | 102 | 192 | output += bit_width; | 103 | 192 | s = 0; | 104 | 192 | } | 105 | | | 106 | | // remainder | 107 | 12 | int byte = tail_count * bit_width; // How many bits are left to store | 108 | 12 | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input | 109 | | | 110 | | // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit | 111 | 12 | for (int i = 0; i < tail_count; i++) { | 112 | 0 | s |= (static_cast<int64_t>(input[i + full_batch_size])) | 113 | 0 | << ((tail_count - i - 1) * bit_width); | 114 | 0 | } | 115 | | | 116 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end | 117 | 12 | s <<= (bytes << 3) - byte; | 118 | | | 119 | | // Starting with the highest valid bit, take out 8 bits in sequence | 120 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid. | 121 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end | 122 | 12 | for (int i = 0; i < bytes; i++) { | 123 | 0 | output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask; | 124 | 0 | } | 125 | 12 | } |
Unexecuted instantiation: _ZN5doris10ForEncoderImE10bit_pack_8EPKmhiPh Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10bit_pack_8EPKS1_hiPh Unexecuted instantiation: _ZN5doris10ForEncoderIoE10bit_pack_8EPKohiPh |
126 | | |
127 | | template <typename T> |
128 | | template <typename U> |
129 | 91.6k | void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { |
130 | 91.6k | U s = 0; |
131 | 91.6k | uint8_t output_mask = 255; |
132 | 91.6k | int tail_count = in_num & 3; // the remainder of in_num modulo 4 |
133 | 91.6k | int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4 |
134 | 91.6k | int output_size = 0; // How many outputs can be processed at a time |
135 | 91.6k | int bit_width_remainder = |
136 | 91.6k | (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time |
137 | 91.6k | int extra_bit = 0; // Extra bits after each process |
138 | | |
139 | 2.80M | for (int i = 0; i < full_batch_size; i += 4) { |
140 | | // Put the 4 numbers in the input into s in order, each number occupies bit_width bit |
141 | | // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop |
142 | 2.70M | s <<= bit_width; |
143 | 2.70M | s |= (static_cast<U>(input[i])); |
144 | 2.70M | s <<= bit_width; |
145 | 2.70M | s |= (static_cast<U>(input[i + 1])); |
146 | 2.70M | s <<= bit_width; |
147 | 2.70M | s |= (static_cast<U>(input[i + 2])); |
148 | 2.70M | s <<= bit_width; |
149 | 2.70M | s |= (static_cast<U>(input[i + 3])); |
150 | | |
151 | | // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s, |
152 | | // and there are extra_bit bits left over from the last loop, |
153 | | // divide by 8 to calculate how much output can be processed in this loop. |
154 | 2.70M | output_size = ((bit_width << 2) + extra_bit) >> 3; |
155 | | |
156 | | // Each loop will leave bit_width_remainder bit unprocessed, |
157 | | // last loop will leave extra_bit bit, eventually will leave |
158 | | // (extra_bit + bit_width_remainder) & 7 bit unprocessed |
159 | 2.70M | extra_bit = (extra_bit + bit_width_remainder) & 7; |
160 | | |
161 | | // Starting with the highest valid bit, take out 8 bits in sequence |
162 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid |
163 | | // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end |
164 | | // But since there are still extra_bit bits that can't be processed, need to add the extra_bit |
165 | 30.4M | for (int j = 0; j < output_size; j++) { |
166 | 27.7M | output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask; |
167 | 27.7M | } |
168 | 2.70M | output += output_size; |
169 | | |
170 | | // s retains the post extra_bit bit as it is not processed |
171 | 2.70M | s &= (1 << extra_bit) - 1; |
172 | 2.70M | } |
173 | | |
174 | | // remainder |
175 | 91.6k | int byte = tail_count * bit_width; // How many bits are left to store |
176 | 91.6k | if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed |
177 | 91.6k | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input |
178 | | |
179 | | // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit |
180 | 220k | for (int i = 0; i < tail_count; i++) { |
181 | 128k | s <<= bit_width; |
182 | 128k | s |= (input[i + full_batch_size]); |
183 | 128k | } |
184 | | |
185 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end |
186 | 91.6k | s <<= (bytes << 3) - byte; |
187 | | |
188 | | // Starting with the highest valid bit, take out 8 bits in sequence |
189 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid. |
190 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end |
191 | 462k | for (int i = 0; i < bytes; i++) { |
192 | 370k | output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask; |
193 | 370k | } |
194 | 91.6k | } Unexecuted instantiation: _ZN5doris10ForEncoderIaE10bit_pack_4IlEEvPKahiPh Unexecuted instantiation: _ZN5doris10ForEncoderIaE10bit_pack_4InEEvPKahiPh Unexecuted instantiation: _ZN5doris10ForEncoderIsE10bit_pack_4IlEEvPKshiPh Unexecuted instantiation: _ZN5doris10ForEncoderIsE10bit_pack_4InEEvPKshiPh Unexecuted instantiation: _ZN5doris10ForEncoderIiE10bit_pack_4IlEEvPKihiPh Unexecuted instantiation: _ZN5doris10ForEncoderIiE10bit_pack_4InEEvPKihiPh _ZN5doris10ForEncoderIlE10bit_pack_4IlEEvPKlhiPh Line | Count | Source | 129 | 6.07k | void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 130 | 6.07k | U s = 0; | 131 | 6.07k | uint8_t output_mask = 255; | 132 | 6.07k | int tail_count = in_num & 3; // the remainder of in_num modulo 4 | 133 | 6.07k | int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4 | 134 | 6.07k | int output_size = 0; // How many outputs can be processed at a time | 135 | 6.07k | int bit_width_remainder = | 136 | 6.07k | (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time | 137 | 6.07k | int extra_bit = 0; // Extra bits after each process | 138 | | | 139 | 135k | for (int i = 0; i < full_batch_size; i += 4) { | 140 | | // Put the 4 numbers in the input into s in order, each number occupies bit_width bit | 141 | | // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop | 142 | 129k | s <<= bit_width; | 143 | 129k | s |= (static_cast<U>(input[i])); | 144 | 129k | s <<= bit_width; | 145 | 129k | s |= (static_cast<U>(input[i + 1])); | 146 | 129k | s <<= bit_width; | 147 | 129k | s |= (static_cast<U>(input[i + 2])); | 148 | 129k | s <<= bit_width; | 149 | 129k | s |= (static_cast<U>(input[i + 3])); | 150 | | | 151 | | // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s, | 152 | | // and there are extra_bit bits left over from the last loop, | 153 | | // divide by 8 to calculate how much output can be processed in this loop. | 154 | 129k | output_size = ((bit_width << 2) + extra_bit) >> 3; | 155 | | | 156 | | // Each loop will leave bit_width_remainder bit unprocessed, | 157 | | // last loop will leave extra_bit bit, eventually will leave | 158 | | // (extra_bit + bit_width_remainder) & 7 bit unprocessed | 159 | 129k | extra_bit = (extra_bit + bit_width_remainder) & 7; | 160 | | | 161 | | // Starting with the highest valid bit, take out 8 bits in sequence | 162 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 163 | | // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end | 164 | | // But since there are still extra_bit bits that can't be processed, need to add the extra_bit | 165 | 934k | for (int j = 0; j < output_size; j++) { | 166 | 805k | output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask; | 167 | 805k | } | 168 | 129k | output += output_size; | 169 | | | 170 | | // s retains the post extra_bit bit as it is not processed | 171 | 129k | s &= (1 << extra_bit) - 1; | 172 | 129k | } | 173 | | | 174 | | // remainder | 175 | 6.07k | int byte = tail_count * bit_width; // How many bits are left to store | 176 | 6.07k | if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed | 177 | 6.07k | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input | 178 | | | 179 | | // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit | 180 | 12.1k | for (int i = 0; i < tail_count; i++) { | 181 | 6.09k | s <<= bit_width; | 182 | 6.09k | s |= (input[i + full_batch_size]); | 183 | 6.09k | } | 184 | | | 185 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end | 186 | 6.07k | s <<= (bytes << 3) - byte; | 187 | | | 188 | | // Starting with the highest valid bit, take out 8 bits in sequence | 189 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid. | 190 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end | 191 | 17.5k | for (int i = 0; i < bytes; i++) { | 192 | 11.4k | output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask; | 193 | 11.4k | } | 194 | 6.07k | } |
_ZN5doris10ForEncoderIlE10bit_pack_4InEEvPKlhiPh Line | Count | Source | 129 | 12.1k | void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 130 | 12.1k | U s = 0; | 131 | 12.1k | uint8_t output_mask = 255; | 132 | 12.1k | int tail_count = in_num & 3; // the remainder of in_num modulo 4 | 133 | 12.1k | int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4 | 134 | 12.1k | int output_size = 0; // How many outputs can be processed at a time | 135 | 12.1k | int bit_width_remainder = | 136 | 12.1k | (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time | 137 | 12.1k | int extra_bit = 0; // Extra bits after each process | 138 | | | 139 | 270k | for (int i = 0; i < full_batch_size; i += 4) { | 140 | | // Put the 4 numbers in the input into s in order, each number occupies bit_width bit | 141 | | // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop | 142 | 258k | s <<= bit_width; | 143 | 258k | s |= (static_cast<U>(input[i])); | 144 | 258k | s <<= bit_width; | 145 | 258k | s |= (static_cast<U>(input[i + 1])); | 146 | 258k | s <<= bit_width; | 147 | 258k | s |= (static_cast<U>(input[i + 2])); | 148 | 258k | s <<= bit_width; | 149 | 258k | s |= (static_cast<U>(input[i + 3])); | 150 | | | 151 | | // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s, | 152 | | // and there are extra_bit bits left over from the last loop, | 153 | | // divide by 8 to calculate how much output can be processed in this loop. | 154 | 258k | output_size = ((bit_width << 2) + extra_bit) >> 3; | 155 | | | 156 | | // Each loop will leave bit_width_remainder bit unprocessed, | 157 | | // last loop will leave extra_bit bit, eventually will leave | 158 | | // (extra_bit + bit_width_remainder) & 7 bit unprocessed | 159 | 258k | extra_bit = (extra_bit + bit_width_remainder) & 7; | 160 | | | 161 | | // Starting with the highest valid bit, take out 8 bits in sequence | 162 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 163 | | // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end | 164 | | // But since there are still extra_bit bits that can't be processed, need to add the extra_bit | 165 | 3.41M | for (int j = 0; j < output_size; j++) { | 166 | 3.16M | output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask; | 167 | 3.16M | } | 168 | 258k | output += output_size; | 169 | | | 170 | | // s retains the post extra_bit bit as it is not processed | 171 | 258k | s &= (1 << extra_bit) - 1; | 172 | 258k | } | 173 | | | 174 | | // remainder | 175 | 12.1k | int byte = tail_count * bit_width; // How many bits are left to store | 176 | 12.1k | if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed | 177 | 12.1k | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input | 178 | | | 179 | | // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit | 180 | 24.4k | for (int i = 0; i < tail_count; i++) { | 181 | 12.2k | s <<= bit_width; | 182 | 12.2k | s |= (input[i + full_batch_size]); | 183 | 12.2k | } | 184 | | | 185 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end | 186 | 12.1k | s <<= (bytes << 3) - byte; | 187 | | | 188 | | // Starting with the highest valid bit, take out 8 bits in sequence | 189 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid. | 190 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end | 191 | 53.4k | for (int i = 0; i < bytes; i++) { | 192 | 41.3k | output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask; | 193 | 41.3k | } | 194 | 12.1k | } |
_ZN5doris10ForEncoderInE10bit_pack_4IlEEvPKnhiPh Line | Count | Source | 129 | 24.4k | void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 130 | 24.4k | U s = 0; | 131 | 24.4k | uint8_t output_mask = 255; | 132 | 24.4k | int tail_count = in_num & 3; // the remainder of in_num modulo 4 | 133 | 24.4k | int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4 | 134 | 24.4k | int output_size = 0; // How many outputs can be processed at a time | 135 | 24.4k | int bit_width_remainder = | 136 | 24.4k | (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time | 137 | 24.4k | int extra_bit = 0; // Extra bits after each process | 138 | | | 139 | 798k | for (int i = 0; i < full_batch_size; i += 4) { | 140 | | // Put the 4 numbers in the input into s in order, each number occupies bit_width bit | 141 | | // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop | 142 | 774k | s <<= bit_width; | 143 | 774k | s |= (static_cast<U>(input[i])); | 144 | 774k | s <<= bit_width; | 145 | 774k | s |= (static_cast<U>(input[i + 1])); | 146 | 774k | s <<= bit_width; | 147 | 774k | s |= (static_cast<U>(input[i + 2])); | 148 | 774k | s <<= bit_width; | 149 | 774k | s |= (static_cast<U>(input[i + 3])); | 150 | | | 151 | | // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s, | 152 | | // and there are extra_bit bits left over from the last loop, | 153 | | // divide by 8 to calculate how much output can be processed in this loop. | 154 | 774k | output_size = ((bit_width << 2) + extra_bit) >> 3; | 155 | | | 156 | | // Each loop will leave bit_width_remainder bit unprocessed, | 157 | | // last loop will leave extra_bit bit, eventually will leave | 158 | | // (extra_bit + bit_width_remainder) & 7 bit unprocessed | 159 | 774k | extra_bit = (extra_bit + bit_width_remainder) & 7; | 160 | | | 161 | | // Starting with the highest valid bit, take out 8 bits in sequence | 162 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 163 | | // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end | 164 | | // But since there are still extra_bit bits that can't be processed, need to add the extra_bit | 165 | 5.60M | for (int j = 0; j < output_size; j++) { | 166 | 4.83M | output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask; | 167 | 4.83M | } | 168 | 774k | output += output_size; | 169 | | | 170 | | // s retains the post extra_bit bit as it is not processed | 171 | 774k | s &= (1 << extra_bit) - 1; | 172 | 774k | } | 173 | | | 174 | | // remainder | 175 | 24.4k | int byte = tail_count * bit_width; // How many bits are left to store | 176 | 24.4k | if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed | 177 | 24.4k | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input | 178 | | | 179 | | // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit | 180 | 61.3k | for (int i = 0; i < tail_count; i++) { | 181 | 36.8k | s <<= bit_width; | 182 | 36.8k | s |= (input[i + full_batch_size]); | 183 | 36.8k | } | 184 | | | 185 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end | 186 | 24.4k | s <<= (bytes << 3) - byte; | 187 | | | 188 | | // Starting with the highest valid bit, take out 8 bits in sequence | 189 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid. | 190 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end | 191 | 93.6k | for (int i = 0; i < bytes; i++) { | 192 | 69.1k | output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask; | 193 | 69.1k | } | 194 | 24.4k | } |
_ZN5doris10ForEncoderInE10bit_pack_4InEEvPKnhiPh Line | Count | Source | 129 | 48.9k | void ForEncoder<T>::bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 130 | 48.9k | U s = 0; | 131 | 48.9k | uint8_t output_mask = 255; | 132 | 48.9k | int tail_count = in_num & 3; // the remainder of in_num modulo 4 | 133 | 48.9k | int full_batch_size = (in_num >> 2) << 2; // Adjust in_num to a multiple of 4 | 134 | 48.9k | int output_size = 0; // How many outputs can be processed at a time | 135 | 48.9k | int bit_width_remainder = | 136 | 48.9k | (bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time | 137 | 48.9k | int extra_bit = 0; // Extra bits after each process | 138 | | | 139 | 1.59M | for (int i = 0; i < full_batch_size; i += 4) { | 140 | | // Put the 4 numbers in the input into s in order, each number occupies bit_width bit | 141 | | // The reason for using s<<=bit_width first is that there are unprocessed bits in the previous loop | 142 | 1.54M | s <<= bit_width; | 143 | 1.54M | s |= (static_cast<U>(input[i])); | 144 | 1.54M | s <<= bit_width; | 145 | 1.54M | s |= (static_cast<U>(input[i + 1])); | 146 | 1.54M | s <<= bit_width; | 147 | 1.54M | s |= (static_cast<U>(input[i + 2])); | 148 | 1.54M | s <<= bit_width; | 149 | 1.54M | s |= (static_cast<U>(input[i + 3])); | 150 | | | 151 | | // ((bit_width * 4) + extra_bit) / 8: There are bit_width*4 bits to be processed in s, | 152 | | // and there are extra_bit bits left over from the last loop, | 153 | | // divide by 8 to calculate how much output can be processed in this loop. | 154 | 1.54M | output_size = ((bit_width << 2) + extra_bit) >> 3; | 155 | | | 156 | | // Each loop will leave bit_width_remainder bit unprocessed, | 157 | | // last loop will leave extra_bit bit, eventually will leave | 158 | | // (extra_bit + bit_width_remainder) & 7 bit unprocessed | 159 | 1.54M | extra_bit = (extra_bit + bit_width_remainder) & 7; | 160 | | | 161 | | // Starting with the highest valid bit, take out 8 bits in sequence | 162 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 163 | | // (output_size-j-1)<<3 used to calculate how many bits need to be removed at the end | 164 | | // But since there are still extra_bit bits that can't be processed, need to add the extra_bit | 165 | 20.5M | for (int j = 0; j < output_size; j++) { | 166 | 18.9M | output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask; | 167 | 18.9M | } | 168 | 1.54M | output += output_size; | 169 | | | 170 | | // s retains the post extra_bit bit as it is not processed | 171 | 1.54M | s &= (1 << extra_bit) - 1; | 172 | 1.54M | } | 173 | | | 174 | | // remainder | 175 | 48.9k | int byte = tail_count * bit_width; // How many bits are left to store | 176 | 48.9k | if (extra_bit != 0) byte += extra_bit; // add extra_bit bit as it is not processed | 177 | 48.9k | int bytes = (byte + 7) >> 3; // How many more bytes are needed to store the rest of input | 178 | | | 179 | | // Put the tail_count numbers in the input into s in order, each number occupies bit_width bit | 180 | 122k | for (int i = 0; i < tail_count; i++) { | 181 | 73.7k | s <<= bit_width; | 182 | 73.7k | s |= (input[i + full_batch_size]); | 183 | 73.7k | } | 184 | | | 185 | | // If byte is not a multiple of 8 and therefore needs to be padded with 0 at the end | 186 | 48.9k | s <<= (bytes << 3) - byte; | 187 | | | 188 | | // Starting with the highest valid bit, take out 8 bits in sequence | 189 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid. | 190 | | // (bytes - i - 1) << 3 used to calculate how many bits need to be removed at the end | 191 | 297k | for (int i = 0; i < bytes; i++) { | 192 | 248k | output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask; | 193 | 248k | } | 194 | 48.9k | } |
Unexecuted instantiation: _ZN5doris10ForEncoderIhE10bit_pack_4IlEEvPKhhiPh Unexecuted instantiation: _ZN5doris10ForEncoderIhE10bit_pack_4InEEvPKhhiPh Unexecuted instantiation: _ZN5doris10ForEncoderItE10bit_pack_4IlEEvPKthiPh Unexecuted instantiation: _ZN5doris10ForEncoderItE10bit_pack_4InEEvPKthiPh Unexecuted instantiation: _ZN5doris10ForEncoderIjE10bit_pack_4IlEEvPKjhiPh Unexecuted instantiation: _ZN5doris10ForEncoderIjE10bit_pack_4InEEvPKjhiPh Unexecuted instantiation: _ZN5doris10ForEncoderImE10bit_pack_4IlEEvPKmhiPh Unexecuted instantiation: _ZN5doris10ForEncoderImE10bit_pack_4InEEvPKmhiPh Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10bit_pack_4IlEEvPKS1_hiPh Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10bit_pack_4InEEvPKS1_hiPh Unexecuted instantiation: _ZN5doris10ForEncoderIoE10bit_pack_4IlEEvPKohiPh Unexecuted instantiation: _ZN5doris10ForEncoderIoE10bit_pack_4InEEvPKohiPh |
195 | | |
196 | | template <typename T> |
197 | 363k | void ForEncoder<T>::bit_pack_1(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { |
198 | 363k | int output_mask = 255; |
199 | 363k | int need_bit = 0; // still need |
200 | | |
201 | 43.8M | for (int i = 0; i < in_num; i++) { |
202 | 43.4M | T x = input[i]; |
203 | 43.4M | int width = bit_width; |
204 | 43.4M | if (need_bit) { |
205 | | // The last time we take away the high 8 - need_bit, |
206 | | // we need to make up the rest of the need_bit from the width. |
207 | | // Use width - need_bit to compute high need_bit bits |
208 | 30.0M | *output |= x >> (width - need_bit); |
209 | 30.0M | output++; |
210 | | // There are need_bit bits being used, so subtract |
211 | 30.0M | width -= need_bit; |
212 | 30.0M | } |
213 | 43.4M | int num = width >> 3; // How many outputs can be processed at a time |
214 | 43.4M | int remainder = width & 7; // How many bits are left to store |
215 | | |
216 | | // Starting with the highest valid bit, take out 8 bits in sequence |
217 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid |
218 | | // (num-j-1)<<3 used to calculate how many bits need to be removed at the end |
219 | | // But since there are still remainder bits that can't be processed, need to add the remainder |
220 | 447M | for (int j = 0; j < num; j++) { |
221 | 404M | *output = cast_set<uint8_t>((x >> (((num - j - 1) << 3) + remainder)) & output_mask); |
222 | 404M | output++; |
223 | 404M | } |
224 | 43.4M | if (remainder) { |
225 | | // Process the last remaining remainder bit. |
226 | | // y = (x & ((1 << remainder) - 1)) extract the last remainder bits. |
227 | | // ouput = y << (8 - reaminder) Use the high 8 - remainder bit |
228 | 30.3M | *output = cast_set<uint8_t>((x & ((1 << remainder) - 1)) << (8 - remainder)); |
229 | | // Already have remainder bits, next time need 8-remainder bits |
230 | 30.3M | need_bit = 8 - remainder; |
231 | 30.3M | } else { |
232 | 13.1M | need_bit = 0; |
233 | 13.1M | } |
234 | 43.4M | } |
235 | 363k | } Unexecuted instantiation: _ZN5doris10ForEncoderIaE10bit_pack_1EPKahiPh Unexecuted instantiation: _ZN5doris10ForEncoderIsE10bit_pack_1EPKshiPh Unexecuted instantiation: _ZN5doris10ForEncoderIiE10bit_pack_1EPKihiPh _ZN5doris10ForEncoderIlE10bit_pack_1EPKlhiPh Line | Count | Source | 197 | 24.3k | void ForEncoder<T>::bit_pack_1(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 198 | 24.3k | int output_mask = 255; | 199 | 24.3k | int need_bit = 0; // still need | 200 | | | 201 | 2.11M | for (int i = 0; i < in_num; i++) { | 202 | 2.08M | T x = input[i]; | 203 | 2.08M | int width = bit_width; | 204 | 2.08M | if (need_bit) { | 205 | | // The last time we take away the high 8 - need_bit, | 206 | | // we need to make up the rest of the need_bit from the width. | 207 | | // Use width - need_bit to compute high need_bit bits | 208 | 1.48M | *output |= x >> (width - need_bit); | 209 | 1.48M | output++; | 210 | | // There are need_bit bits being used, so subtract | 211 | 1.48M | width -= need_bit; | 212 | 1.48M | } | 213 | 2.08M | int num = width >> 3; // How many outputs can be processed at a time | 214 | 2.08M | int remainder = width & 7; // How many bits are left to store | 215 | | | 216 | | // Starting with the highest valid bit, take out 8 bits in sequence | 217 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 218 | | // (num-j-1)<<3 used to calculate how many bits need to be removed at the end | 219 | | // But since there are still remainder bits that can't be processed, need to add the remainder | 220 | 13.2M | for (int j = 0; j < num; j++) { | 221 | 11.1M | *output = cast_set<uint8_t>((x >> (((num - j - 1) << 3) + remainder)) & output_mask); | 222 | 11.1M | output++; | 223 | 11.1M | } | 224 | 2.08M | if (remainder) { | 225 | | // Process the last remaining remainder bit. | 226 | | // y = (x & ((1 << remainder) - 1)) extract the last remainder bits. | 227 | | // ouput = y << (8 - reaminder) Use the high 8 - remainder bit | 228 | 1.49M | *output = cast_set<uint8_t>((x & ((1 << remainder) - 1)) << (8 - remainder)); | 229 | | // Already have remainder bits, next time need 8-remainder bits | 230 | 1.49M | need_bit = 8 - remainder; | 231 | 1.49M | } else { | 232 | 589k | need_bit = 0; | 233 | 589k | } | 234 | 2.08M | } | 235 | 24.3k | } |
_ZN5doris10ForEncoderInE10bit_pack_1EPKnhiPh Line | Count | Source | 197 | 339k | void ForEncoder<T>::bit_pack_1(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 198 | 339k | int output_mask = 255; | 199 | 339k | int need_bit = 0; // still need | 200 | | | 201 | 41.7M | for (int i = 0; i < in_num; i++) { | 202 | 41.3M | T x = input[i]; | 203 | 41.3M | int width = bit_width; | 204 | 41.3M | if (need_bit) { | 205 | | // The last time we take away the high 8 - need_bit, | 206 | | // we need to make up the rest of the need_bit from the width. | 207 | | // Use width - need_bit to compute high need_bit bits | 208 | 28.6M | *output |= x >> (width - need_bit); | 209 | 28.6M | output++; | 210 | | // There are need_bit bits being used, so subtract | 211 | 28.6M | width -= need_bit; | 212 | 28.6M | } | 213 | 41.3M | int num = width >> 3; // How many outputs can be processed at a time | 214 | 41.3M | int remainder = width & 7; // How many bits are left to store | 215 | | | 216 | | // Starting with the highest valid bit, take out 8 bits in sequence | 217 | | // perform an AND operation with output_mask to ensure that only 8 bits are valid | 218 | | // (num-j-1)<<3 used to calculate how many bits need to be removed at the end | 219 | | // But since there are still remainder bits that can't be processed, need to add the remainder | 220 | 434M | for (int j = 0; j < num; j++) { | 221 | 393M | *output = cast_set<uint8_t>((x >> (((num - j - 1) << 3) + remainder)) & output_mask); | 222 | 393M | output++; | 223 | 393M | } | 224 | 41.3M | if (remainder) { | 225 | | // Process the last remaining remainder bit. | 226 | | // y = (x & ((1 << remainder) - 1)) extract the last remainder bits. | 227 | | // ouput = y << (8 - reaminder) Use the high 8 - remainder bit | 228 | 28.8M | *output = cast_set<uint8_t>((x & ((1 << remainder) - 1)) << (8 - remainder)); | 229 | | // Already have remainder bits, next time need 8-remainder bits | 230 | 28.8M | need_bit = 8 - remainder; | 231 | 28.8M | } else { | 232 | 12.5M | need_bit = 0; | 233 | 12.5M | } | 234 | 41.3M | } | 235 | 339k | } |
Unexecuted instantiation: _ZN5doris10ForEncoderIhE10bit_pack_1EPKhhiPh Unexecuted instantiation: _ZN5doris10ForEncoderItE10bit_pack_1EPKthiPh Unexecuted instantiation: _ZN5doris10ForEncoderIjE10bit_pack_1EPKjhiPh Unexecuted instantiation: _ZN5doris10ForEncoderImE10bit_pack_1EPKmhiPh Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE10bit_pack_1EPKS1_hiPh Unexecuted instantiation: _ZN5doris10ForEncoderIoE10bit_pack_1EPKohiPh |
236 | | |
237 | | // Use as few bit as possible to store a piece of integer data. |
238 | | // param[in] input: the integer list need to pack |
239 | | // param[in] in_num: the number integer need to pack |
240 | | // param[in] bit_width: how many bit we use to store each integer data |
241 | | // param[out] out: the packed result |
242 | | |
243 | | // For example: |
244 | | // The input is int32 list: 1, 2, 4, 8 and bit_width is 4 |
245 | | // The output will be: 0001 0010 0100 1000 |
246 | | template <typename T> |
247 | 486k | void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { |
248 | 486k | if (in_num == 0 || bit_width == 0) { |
249 | 520 | return; |
250 | 520 | } |
251 | | /* |
252 | | bit_width <= 8 : pack_8 > pack_16 > pack_32 |
253 | | bit_width <= 16 : pack_4 > pack_8 > pack_16 |
254 | | bit_width <= 32 : pack_4 >= pack_2 > pack_8 |
255 | | (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently) |
256 | | bit_width <= 64 : pack_1 > pack_4 |
257 | | */ |
258 | 485k | if (bit_width <= 8) { |
259 | 30.6k | bit_pack_8(input, in_num, bit_width, output); |
260 | 455k | } else if (bit_width <= 16) { |
261 | 30.5k | bit_pack_4<int64_t>(input, in_num, bit_width, output); |
262 | 424k | } else if (bit_width <= 32) { |
263 | 61.1k | bit_pack_4<__int128_t>(input, in_num, bit_width, output); |
264 | 363k | } else { |
265 | 363k | bit_pack_1(input, in_num, bit_width, output); |
266 | 363k | } |
267 | 485k | } Unexecuted instantiation: _ZN5doris10ForEncoderIaE8bit_packEPKahiPh Unexecuted instantiation: _ZN5doris10ForEncoderIsE8bit_packEPKshiPh _ZN5doris10ForEncoderIiE8bit_packEPKihiPh Line | Count | Source | 247 | 18 | void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 248 | 18 | if (in_num == 0 || bit_width == 0) { | 249 | 2 | return; | 250 | 2 | } | 251 | | /* | 252 | | bit_width <= 8 : pack_8 > pack_16 > pack_32 | 253 | | bit_width <= 16 : pack_4 > pack_8 > pack_16 | 254 | | bit_width <= 32 : pack_4 >= pack_2 > pack_8 | 255 | | (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently) | 256 | | bit_width <= 64 : pack_1 > pack_4 | 257 | | */ | 258 | 16 | if (bit_width <= 8) { | 259 | 16 | bit_pack_8(input, in_num, bit_width, output); | 260 | 16 | } else if (bit_width <= 16) { | 261 | 0 | bit_pack_4<int64_t>(input, in_num, bit_width, output); | 262 | 0 | } else if (bit_width <= 32) { | 263 | 0 | bit_pack_4<__int128_t>(input, in_num, bit_width, output); | 264 | 0 | } else { | 265 | 0 | bit_pack_1(input, in_num, bit_width, output); | 266 | 0 | } | 267 | 16 | } |
_ZN5doris10ForEncoderIlE8bit_packEPKlhiPh Line | Count | Source | 247 | 48.9k | void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 248 | 48.9k | if (in_num == 0 || bit_width == 0) { | 249 | 262 | return; | 250 | 262 | } | 251 | | /* | 252 | | bit_width <= 8 : pack_8 > pack_16 > pack_32 | 253 | | bit_width <= 16 : pack_4 > pack_8 > pack_16 | 254 | | bit_width <= 32 : pack_4 >= pack_2 > pack_8 | 255 | | (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently) | 256 | | bit_width <= 64 : pack_1 > pack_4 | 257 | | */ | 258 | 48.6k | if (bit_width <= 8) { | 259 | 6.10k | bit_pack_8(input, in_num, bit_width, output); | 260 | 42.5k | } else if (bit_width <= 16) { | 261 | 6.07k | bit_pack_4<int64_t>(input, in_num, bit_width, output); | 262 | 36.4k | } else if (bit_width <= 32) { | 263 | 12.1k | bit_pack_4<__int128_t>(input, in_num, bit_width, output); | 264 | 24.3k | } else { | 265 | 24.3k | bit_pack_1(input, in_num, bit_width, output); | 266 | 24.3k | } | 267 | 48.6k | } |
_ZN5doris10ForEncoderInE8bit_packEPKnhiPh Line | Count | Source | 247 | 437k | void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 248 | 437k | if (in_num == 0 || bit_width == 0) { | 249 | 256 | return; | 250 | 256 | } | 251 | | /* | 252 | | bit_width <= 8 : pack_8 > pack_16 > pack_32 | 253 | | bit_width <= 16 : pack_4 > pack_8 > pack_16 | 254 | | bit_width <= 32 : pack_4 >= pack_2 > pack_8 | 255 | | (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently) | 256 | | bit_width <= 64 : pack_1 > pack_4 | 257 | | */ | 258 | 437k | if (bit_width <= 8) { | 259 | 24.4k | bit_pack_8(input, in_num, bit_width, output); | 260 | 412k | } else if (bit_width <= 16) { | 261 | 24.4k | bit_pack_4<int64_t>(input, in_num, bit_width, output); | 262 | 388k | } else if (bit_width <= 32) { | 263 | 48.9k | bit_pack_4<__int128_t>(input, in_num, bit_width, output); | 264 | 339k | } else { | 265 | 339k | bit_pack_1(input, in_num, bit_width, output); | 266 | 339k | } | 267 | 437k | } |
Unexecuted instantiation: _ZN5doris10ForEncoderIhE8bit_packEPKhhiPh Unexecuted instantiation: _ZN5doris10ForEncoderItE8bit_packEPKthiPh _ZN5doris10ForEncoderIjE8bit_packEPKjhiPh Line | Count | Source | 247 | 12 | void ForEncoder<T>::bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output) { | 248 | 12 | if (in_num == 0 || bit_width == 0) { | 249 | 0 | return; | 250 | 0 | } | 251 | | /* | 252 | | bit_width <= 8 : pack_8 > pack_16 > pack_32 | 253 | | bit_width <= 16 : pack_4 > pack_8 > pack_16 | 254 | | bit_width <= 32 : pack_4 >= pack_2 > pack_8 | 255 | | (pack_4 and pack_2 have nearly similar execution times, but pack_4 utilizes space more efficiently) | 256 | | bit_width <= 64 : pack_1 > pack_4 | 257 | | */ | 258 | 12 | if (bit_width <= 8) { | 259 | 12 | bit_pack_8(input, in_num, bit_width, output); | 260 | 12 | } else if (bit_width <= 16) { | 261 | 0 | bit_pack_4<int64_t>(input, in_num, bit_width, output); | 262 | 0 | } else if (bit_width <= 32) { | 263 | 0 | bit_pack_4<__int128_t>(input, in_num, bit_width, output); | 264 | 0 | } else { | 265 | 0 | bit_pack_1(input, in_num, bit_width, output); | 266 | 0 | } | 267 | 12 | } |
Unexecuted instantiation: _ZN5doris10ForEncoderImE8bit_packEPKmhiPh Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE8bit_packEPKS1_hiPh Unexecuted instantiation: _ZN5doris10ForEncoderIoE8bit_packEPKohiPh |
268 | | |
269 | | template <typename T> |
270 | 97.8k | void ForEncoder<T>::bit_packing_one_frame_value(const T* input) { |
271 | 97.8k | T min = input[0]; |
272 | 97.8k | T max = input[0]; |
273 | 97.8k | bool is_ascending = true; |
274 | 97.8k | uint8_t bit_width = 0; |
275 | 97.8k | T half_max_delta = numeric_limits_max() >> 1; |
276 | 97.8k | bool is_keep_original_value = false; |
277 | | |
278 | | // 1. make sure order_flag, save_original_value, and find max&min. |
279 | 8.36M | for (uint8_t i = 1; i < _buffered_values_num; ++i) { |
280 | 8.26M | if (is_ascending) { |
281 | 172k | if (input[i] < input[i - 1]) { |
282 | 96.9k | is_ascending = false; |
283 | 96.9k | } else { |
284 | 76.0k | if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow |
285 | 0 | is_keep_original_value = true; |
286 | 76.0k | } else { |
287 | 76.0k | bit_width = std::max(bit_width, bits(input[i] - input[i - 1])); |
288 | 76.0k | } |
289 | 76.0k | } |
290 | 172k | } |
291 | | |
292 | 8.26M | if (input[i] < min) { |
293 | 361k | min = input[i]; |
294 | 361k | continue; |
295 | 361k | } |
296 | | |
297 | 7.90M | if (input[i] > max) { |
298 | 367k | max = input[i]; |
299 | 367k | } |
300 | 7.90M | } |
301 | 97.8k | if (!is_ascending) { |
302 | 96.9k | if ((max >> 1) - (min >> 1) > half_max_delta) { |
303 | 0 | is_keep_original_value = true; |
304 | 0 | } |
305 | 96.9k | } |
306 | | |
307 | | // 2. save min value. |
308 | 97.8k | if (sizeof(T) == 16) { |
309 | 48.8k | put_fixed128_le(_buffer, static_cast<uint128_t>(min)); |
310 | 48.9k | } else if (sizeof(T) == 8) { |
311 | 48.9k | put_fixed64_le(_buffer, static_cast<uint64_t>(min)); |
312 | 48.9k | } else { |
313 | 30 | put_fixed32_le(_buffer, static_cast<uint32_t>(min)); |
314 | 30 | } |
315 | | |
316 | | // 3.1 save original value. |
317 | 97.8k | if (is_keep_original_value) { |
318 | 0 | bit_width = sizeof(T) * 8; |
319 | 0 | uint32_t len = _buffered_values_num * bit_width; |
320 | 0 | _buffer->reserve(_buffer->size() + len); |
321 | 0 | size_t origin_size = _buffer->size(); |
322 | 0 | _buffer->resize(origin_size + len); |
323 | 0 | bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size); |
324 | 97.8k | } else { |
325 | | // 3.2 bit pack. |
326 | | // improve for ascending order input, we could use fewer bit |
327 | 97.8k | T delta_values[FRAME_VALUE_NUM]; |
328 | 97.8k | if (is_ascending) { |
329 | 898 | delta_values[0] = 0; |
330 | 6.34k | for (uint8_t i = 1; i < _buffered_values_num; ++i) { |
331 | 5.45k | delta_values[i] = input[i] - input[i - 1]; |
332 | 5.45k | } |
333 | 96.9k | } else { |
334 | 96.9k | bit_width = bits(static_cast<T>(max - min)); |
335 | 8.45M | for (uint8_t i = 0; i < _buffered_values_num; ++i) { |
336 | 8.35M | delta_values[i] = input[i] - min; |
337 | 8.35M | } |
338 | 96.9k | } |
339 | | |
340 | 97.8k | uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8); |
341 | | |
342 | 97.8k | _buffer->reserve(_buffer->size() + packing_len); |
343 | 97.8k | size_t origin_size = _buffer->size(); |
344 | 97.8k | _buffer->resize(origin_size + packing_len); |
345 | 97.8k | bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size); |
346 | 97.8k | } |
347 | 97.8k | uint8_t storage_format = 0; |
348 | 97.8k | if (is_keep_original_value) { |
349 | 0 | storage_format = 2; |
350 | 97.8k | } else if (is_ascending) { |
351 | 898 | storage_format = 1; |
352 | 898 | } |
353 | 97.8k | _storage_formats.push_back(storage_format); |
354 | 97.8k | _bit_widths.push_back(bit_width); |
355 | | |
356 | 97.8k | _buffered_values_num = 0; |
357 | 97.8k | } Unexecuted instantiation: _ZN5doris10ForEncoderIaE27bit_packing_one_frame_valueEPKa Unexecuted instantiation: _ZN5doris10ForEncoderIsE27bit_packing_one_frame_valueEPKs _ZN5doris10ForEncoderIiE27bit_packing_one_frame_valueEPKi Line | Count | Source | 270 | 18 | void ForEncoder<T>::bit_packing_one_frame_value(const T* input) { | 271 | 18 | T min = input[0]; | 272 | 18 | T max = input[0]; | 273 | 18 | bool is_ascending = true; | 274 | 18 | uint8_t bit_width = 0; | 275 | 18 | T half_max_delta = numeric_limits_max() >> 1; | 276 | 18 | bool is_keep_original_value = false; | 277 | | | 278 | | // 1. make sure order_flag, save_original_value, and find max&min. | 279 | 1.54k | for (uint8_t i = 1; i < _buffered_values_num; ++i) { | 280 | 1.52k | if (is_ascending) { | 281 | 1.52k | if (input[i] < input[i - 1]) { | 282 | 0 | is_ascending = false; | 283 | 1.52k | } else { | 284 | 1.52k | if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow | 285 | 0 | is_keep_original_value = true; | 286 | 1.52k | } else { | 287 | 1.52k | bit_width = std::max(bit_width, bits(input[i] - input[i - 1])); | 288 | 1.52k | } | 289 | 1.52k | } | 290 | 1.52k | } | 291 | | | 292 | 1.52k | if (input[i] < min) { | 293 | 0 | min = input[i]; | 294 | 0 | continue; | 295 | 0 | } | 296 | | | 297 | 1.52k | if (input[i] > max) { | 298 | 1.52k | max = input[i]; | 299 | 1.52k | } | 300 | 1.52k | } | 301 | 18 | if (!is_ascending) { | 302 | 0 | if ((max >> 1) - (min >> 1) > half_max_delta) { | 303 | 0 | is_keep_original_value = true; | 304 | 0 | } | 305 | 0 | } | 306 | | | 307 | | // 2. save min value. | 308 | 18 | if (sizeof(T) == 16) { | 309 | 0 | put_fixed128_le(_buffer, static_cast<uint128_t>(min)); | 310 | 18 | } else if (sizeof(T) == 8) { | 311 | 0 | put_fixed64_le(_buffer, static_cast<uint64_t>(min)); | 312 | 18 | } else { | 313 | 18 | put_fixed32_le(_buffer, static_cast<uint32_t>(min)); | 314 | 18 | } | 315 | | | 316 | | // 3.1 save original value. | 317 | 18 | if (is_keep_original_value) { | 318 | 0 | bit_width = sizeof(T) * 8; | 319 | 0 | uint32_t len = _buffered_values_num * bit_width; | 320 | 0 | _buffer->reserve(_buffer->size() + len); | 321 | 0 | size_t origin_size = _buffer->size(); | 322 | 0 | _buffer->resize(origin_size + len); | 323 | 0 | bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size); | 324 | 18 | } else { | 325 | | // 3.2 bit pack. | 326 | | // improve for ascending order input, we could use fewer bit | 327 | 18 | T delta_values[FRAME_VALUE_NUM]; | 328 | 18 | if (is_ascending) { | 329 | 18 | delta_values[0] = 0; | 330 | 1.54k | for (uint8_t i = 1; i < _buffered_values_num; ++i) { | 331 | 1.52k | delta_values[i] = input[i] - input[i - 1]; | 332 | 1.52k | } | 333 | 18 | } else { | 334 | 0 | bit_width = bits(static_cast<T>(max - min)); | 335 | 0 | for (uint8_t i = 0; i < _buffered_values_num; ++i) { | 336 | 0 | delta_values[i] = input[i] - min; | 337 | 0 | } | 338 | 0 | } | 339 | | | 340 | 18 | uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8); | 341 | | | 342 | 18 | _buffer->reserve(_buffer->size() + packing_len); | 343 | 18 | size_t origin_size = _buffer->size(); | 344 | 18 | _buffer->resize(origin_size + packing_len); | 345 | 18 | bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size); | 346 | 18 | } | 347 | 18 | uint8_t storage_format = 0; | 348 | 18 | if (is_keep_original_value) { | 349 | 0 | storage_format = 2; | 350 | 18 | } else if (is_ascending) { | 351 | 18 | storage_format = 1; | 352 | 18 | } | 353 | 18 | _storage_formats.push_back(storage_format); | 354 | 18 | _bit_widths.push_back(bit_width); | 355 | | | 356 | 18 | _buffered_values_num = 0; | 357 | 18 | } |
_ZN5doris10ForEncoderIlE27bit_packing_one_frame_valueEPKl Line | Count | Source | 270 | 48.9k | void ForEncoder<T>::bit_packing_one_frame_value(const T* input) { | 271 | 48.9k | T min = input[0]; | 272 | 48.9k | T max = input[0]; | 273 | 48.9k | bool is_ascending = true; | 274 | 48.9k | uint8_t bit_width = 0; | 275 | 48.9k | T half_max_delta = numeric_limits_max() >> 1; | 276 | 48.9k | bool is_keep_original_value = false; | 277 | | | 278 | | // 1. make sure order_flag, save_original_value, and find max&min. | 279 | 4.17M | for (uint8_t i = 1; i < _buffered_values_num; ++i) { | 280 | 4.13M | if (is_ascending) { | 281 | 86.7k | if (input[i] < input[i - 1]) { | 282 | 48.4k | is_ascending = false; | 283 | 48.4k | } else { | 284 | 38.2k | if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow | 285 | 0 | is_keep_original_value = true; | 286 | 38.2k | } else { | 287 | 38.2k | bit_width = std::max(bit_width, bits(input[i] - input[i - 1])); | 288 | 38.2k | } | 289 | 38.2k | } | 290 | 86.7k | } | 291 | | | 292 | 4.13M | if (input[i] < min) { | 293 | 176k | min = input[i]; | 294 | 176k | continue; | 295 | 176k | } | 296 | | | 297 | 3.95M | if (input[i] > max) { | 298 | 179k | max = input[i]; | 299 | 179k | } | 300 | 3.95M | } | 301 | 48.9k | if (!is_ascending) { | 302 | 48.4k | if ((max >> 1) - (min >> 1) > half_max_delta) { | 303 | 0 | is_keep_original_value = true; | 304 | 0 | } | 305 | 48.4k | } | 306 | | | 307 | | // 2. save min value. | 308 | 48.9k | if (sizeof(T) == 16) { | 309 | 0 | put_fixed128_le(_buffer, static_cast<uint128_t>(min)); | 310 | 48.9k | } else if (sizeof(T) == 8) { | 311 | 48.9k | put_fixed64_le(_buffer, static_cast<uint64_t>(min)); | 312 | 48.9k | } else { | 313 | 0 | put_fixed32_le(_buffer, static_cast<uint32_t>(min)); | 314 | 0 | } | 315 | | | 316 | | // 3.1 save original value. | 317 | 48.9k | if (is_keep_original_value) { | 318 | 0 | bit_width = sizeof(T) * 8; | 319 | 0 | uint32_t len = _buffered_values_num * bit_width; | 320 | 0 | _buffer->reserve(_buffer->size() + len); | 321 | 0 | size_t origin_size = _buffer->size(); | 322 | 0 | _buffer->resize(origin_size + len); | 323 | 0 | bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size); | 324 | 48.9k | } else { | 325 | | // 3.2 bit pack. | 326 | | // improve for ascending order input, we could use fewer bit | 327 | 48.9k | T delta_values[FRAME_VALUE_NUM]; | 328 | 48.9k | if (is_ascending) { | 329 | 440 | delta_values[0] = 0; | 330 | 2.59k | for (uint8_t i = 1; i < _buffered_values_num; ++i) { | 331 | 2.15k | delta_values[i] = input[i] - input[i - 1]; | 332 | 2.15k | } | 333 | 48.4k | } else { | 334 | 48.4k | bit_width = bits(static_cast<T>(max - min)); | 335 | 4.22M | for (uint8_t i = 0; i < _buffered_values_num; ++i) { | 336 | 4.17M | delta_values[i] = input[i] - min; | 337 | 4.17M | } | 338 | 48.4k | } | 339 | | | 340 | 48.9k | uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8); | 341 | | | 342 | 48.9k | _buffer->reserve(_buffer->size() + packing_len); | 343 | 48.9k | size_t origin_size = _buffer->size(); | 344 | 48.9k | _buffer->resize(origin_size + packing_len); | 345 | 48.9k | bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size); | 346 | 48.9k | } | 347 | 48.9k | uint8_t storage_format = 0; | 348 | 48.9k | if (is_keep_original_value) { | 349 | 0 | storage_format = 2; | 350 | 48.9k | } else if (is_ascending) { | 351 | 440 | storage_format = 1; | 352 | 440 | } | 353 | 48.9k | _storage_formats.push_back(storage_format); | 354 | 48.9k | _bit_widths.push_back(bit_width); | 355 | | | 356 | 48.9k | _buffered_values_num = 0; | 357 | 48.9k | } |
_ZN5doris10ForEncoderInE27bit_packing_one_frame_valueEPKn Line | Count | Source | 270 | 48.8k | void ForEncoder<T>::bit_packing_one_frame_value(const T* input) { | 271 | 48.8k | T min = input[0]; | 272 | 48.8k | T max = input[0]; | 273 | 48.8k | bool is_ascending = true; | 274 | 48.8k | uint8_t bit_width = 0; | 275 | 48.8k | T half_max_delta = numeric_limits_max() >> 1; | 276 | 48.8k | bool is_keep_original_value = false; | 277 | | | 278 | | // 1. make sure order_flag, save_original_value, and find max&min. | 279 | 4.17M | for (uint8_t i = 1; i < _buffered_values_num; ++i) { | 280 | 4.12M | if (is_ascending) { | 281 | 83.1k | if (input[i] < input[i - 1]) { | 282 | 48.4k | is_ascending = false; | 283 | 48.4k | } else { | 284 | 34.7k | if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow | 285 | 0 | is_keep_original_value = true; | 286 | 34.7k | } else { | 287 | 34.7k | bit_width = std::max(bit_width, bits(input[i] - input[i - 1])); | 288 | 34.7k | } | 289 | 34.7k | } | 290 | 83.1k | } | 291 | | | 292 | 4.12M | if (input[i] < min) { | 293 | 185k | min = input[i]; | 294 | 185k | continue; | 295 | 185k | } | 296 | | | 297 | 3.94M | if (input[i] > max) { | 298 | 185k | max = input[i]; | 299 | 185k | } | 300 | 3.94M | } | 301 | 48.8k | if (!is_ascending) { | 302 | 48.4k | if ((max >> 1) - (min >> 1) > half_max_delta) { | 303 | 0 | is_keep_original_value = true; | 304 | 0 | } | 305 | 48.4k | } | 306 | | | 307 | | // 2. save min value. | 308 | 48.8k | if (sizeof(T) == 16) { | 309 | 48.8k | put_fixed128_le(_buffer, static_cast<uint128_t>(min)); | 310 | 48.8k | } else if (sizeof(T) == 8) { | 311 | 0 | put_fixed64_le(_buffer, static_cast<uint64_t>(min)); | 312 | 0 | } else { | 313 | 0 | put_fixed32_le(_buffer, static_cast<uint32_t>(min)); | 314 | 0 | } | 315 | | | 316 | | // 3.1 save original value. | 317 | 48.8k | if (is_keep_original_value) { | 318 | 0 | bit_width = sizeof(T) * 8; | 319 | 0 | uint32_t len = _buffered_values_num * bit_width; | 320 | 0 | _buffer->reserve(_buffer->size() + len); | 321 | 0 | size_t origin_size = _buffer->size(); | 322 | 0 | _buffer->resize(origin_size + len); | 323 | 0 | bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size); | 324 | 48.8k | } else { | 325 | | // 3.2 bit pack. | 326 | | // improve for ascending order input, we could use fewer bit | 327 | 48.8k | T delta_values[FRAME_VALUE_NUM]; | 328 | 48.8k | if (is_ascending) { | 329 | 428 | delta_values[0] = 0; | 330 | 676 | for (uint8_t i = 1; i < _buffered_values_num; ++i) { | 331 | 248 | delta_values[i] = input[i] - input[i - 1]; | 332 | 248 | } | 333 | 48.4k | } else { | 334 | 48.4k | bit_width = bits(static_cast<T>(max - min)); | 335 | 4.22M | for (uint8_t i = 0; i < _buffered_values_num; ++i) { | 336 | 4.17M | delta_values[i] = input[i] - min; | 337 | 4.17M | } | 338 | 48.4k | } | 339 | | | 340 | 48.8k | uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8); | 341 | | | 342 | 48.8k | _buffer->reserve(_buffer->size() + packing_len); | 343 | 48.8k | size_t origin_size = _buffer->size(); | 344 | 48.8k | _buffer->resize(origin_size + packing_len); | 345 | 48.8k | bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size); | 346 | 48.8k | } | 347 | 48.8k | uint8_t storage_format = 0; | 348 | 48.8k | if (is_keep_original_value) { | 349 | 0 | storage_format = 2; | 350 | 48.8k | } else if (is_ascending) { | 351 | 428 | storage_format = 1; | 352 | 428 | } | 353 | 48.8k | _storage_formats.push_back(storage_format); | 354 | 48.8k | _bit_widths.push_back(bit_width); | 355 | | | 356 | 48.8k | _buffered_values_num = 0; | 357 | 48.8k | } |
Unexecuted instantiation: _ZN5doris10ForEncoderIhE27bit_packing_one_frame_valueEPKh Unexecuted instantiation: _ZN5doris10ForEncoderItE27bit_packing_one_frame_valueEPKt _ZN5doris10ForEncoderIjE27bit_packing_one_frame_valueEPKj Line | Count | Source | 270 | 12 | void ForEncoder<T>::bit_packing_one_frame_value(const T* input) { | 271 | 12 | T min = input[0]; | 272 | 12 | T max = input[0]; | 273 | 12 | bool is_ascending = true; | 274 | 12 | uint8_t bit_width = 0; | 275 | 12 | T half_max_delta = numeric_limits_max() >> 1; | 276 | 12 | bool is_keep_original_value = false; | 277 | | | 278 | | // 1. make sure order_flag, save_original_value, and find max&min. | 279 | 1.53k | for (uint8_t i = 1; i < _buffered_values_num; ++i) { | 280 | 1.52k | if (is_ascending) { | 281 | 1.52k | if (input[i] < input[i - 1]) { | 282 | 0 | is_ascending = false; | 283 | 1.52k | } else { | 284 | 1.52k | if ((input[i] >> 1) - (input[i - 1] >> 1) > half_max_delta) { // overflow | 285 | 0 | is_keep_original_value = true; | 286 | 1.52k | } else { | 287 | 1.52k | bit_width = std::max(bit_width, bits(input[i] - input[i - 1])); | 288 | 1.52k | } | 289 | 1.52k | } | 290 | 1.52k | } | 291 | | | 292 | 1.52k | if (input[i] < min) { | 293 | 0 | min = input[i]; | 294 | 0 | continue; | 295 | 0 | } | 296 | | | 297 | 1.52k | if (input[i] > max) { | 298 | 1.52k | max = input[i]; | 299 | 1.52k | } | 300 | 1.52k | } | 301 | 12 | if (!is_ascending) { | 302 | 0 | if ((max >> 1) - (min >> 1) > half_max_delta) { | 303 | 0 | is_keep_original_value = true; | 304 | 0 | } | 305 | 0 | } | 306 | | | 307 | | // 2. save min value. | 308 | 12 | if (sizeof(T) == 16) { | 309 | 0 | put_fixed128_le(_buffer, static_cast<uint128_t>(min)); | 310 | 12 | } else if (sizeof(T) == 8) { | 311 | 0 | put_fixed64_le(_buffer, static_cast<uint64_t>(min)); | 312 | 12 | } else { | 313 | 12 | put_fixed32_le(_buffer, static_cast<uint32_t>(min)); | 314 | 12 | } | 315 | | | 316 | | // 3.1 save original value. | 317 | 12 | if (is_keep_original_value) { | 318 | 0 | bit_width = sizeof(T) * 8; | 319 | 0 | uint32_t len = _buffered_values_num * bit_width; | 320 | 0 | _buffer->reserve(_buffer->size() + len); | 321 | 0 | size_t origin_size = _buffer->size(); | 322 | 0 | _buffer->resize(origin_size + len); | 323 | 0 | bit_pack(input, _buffered_values_num, bit_width, _buffer->data() + origin_size); | 324 | 12 | } else { | 325 | | // 3.2 bit pack. | 326 | | // improve for ascending order input, we could use fewer bit | 327 | 12 | T delta_values[FRAME_VALUE_NUM]; | 328 | 12 | if (is_ascending) { | 329 | 12 | delta_values[0] = 0; | 330 | 1.53k | for (uint8_t i = 1; i < _buffered_values_num; ++i) { | 331 | 1.52k | delta_values[i] = input[i] - input[i - 1]; | 332 | 1.52k | } | 333 | 12 | } else { | 334 | 0 | bit_width = bits(static_cast<T>(max - min)); | 335 | 0 | for (uint8_t i = 0; i < _buffered_values_num; ++i) { | 336 | 0 | delta_values[i] = input[i] - min; | 337 | 0 | } | 338 | 0 | } | 339 | | | 340 | 12 | uint32_t packing_len = BitUtil::Ceil(_buffered_values_num * bit_width, 8); | 341 | | | 342 | 12 | _buffer->reserve(_buffer->size() + packing_len); | 343 | 12 | size_t origin_size = _buffer->size(); | 344 | 12 | _buffer->resize(origin_size + packing_len); | 345 | 12 | bit_pack(delta_values, _buffered_values_num, bit_width, _buffer->data() + origin_size); | 346 | 12 | } | 347 | 12 | uint8_t storage_format = 0; | 348 | 12 | if (is_keep_original_value) { | 349 | 0 | storage_format = 2; | 350 | 12 | } else if (is_ascending) { | 351 | 12 | storage_format = 1; | 352 | 12 | } | 353 | 12 | _storage_formats.push_back(storage_format); | 354 | 12 | _bit_widths.push_back(bit_width); | 355 | | | 356 | 12 | _buffered_values_num = 0; | 357 | 12 | } |
Unexecuted instantiation: _ZN5doris10ForEncoderImE27bit_packing_one_frame_valueEPKm Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE27bit_packing_one_frame_valueEPKS1_ Unexecuted instantiation: _ZN5doris10ForEncoderIoE27bit_packing_one_frame_valueEPKo |
358 | | |
359 | | template <typename T> |
360 | 65.3k | uint32_t ForEncoder<T>::flush() { |
361 | 65.3k | if (_buffered_values_num != 0) { |
362 | 65.0k | bit_packing_one_frame_value(_buffered_values); |
363 | 65.0k | } |
364 | | |
365 | | // write the footer: |
366 | | // 1 _storage_formats and bit_widths |
367 | 65.3k | DCHECK(_storage_formats.size() == _bit_widths.size()) |
368 | 0 | << "Size of _storage_formats and _bit_widths should be equal."; |
369 | 163k | for (size_t i = 0; i < _storage_formats.size(); i++) { |
370 | 97.8k | _buffer->append(&_storage_formats[i], 1); |
371 | 97.8k | _buffer->append(&_bit_widths[i], 1); |
372 | 97.8k | } |
373 | | // 2 frame_value_num and values_num |
374 | 65.3k | uint8_t frame_value_num = FRAME_VALUE_NUM; |
375 | 65.3k | _buffer->append(&frame_value_num, 1); |
376 | 65.3k | put_fixed32_le(_buffer, _values_num); |
377 | | |
378 | 65.3k | return cast_set<uint32_t>(_buffer->size()); |
379 | 65.3k | } Unexecuted instantiation: _ZN5doris10ForEncoderIaE5flushEv Unexecuted instantiation: _ZN5doris10ForEncoderIsE5flushEv _ZN5doris10ForEncoderIiE5flushEv Line | Count | Source | 360 | 14 | uint32_t ForEncoder<T>::flush() { | 361 | 14 | if (_buffered_values_num != 0) { | 362 | 8 | bit_packing_one_frame_value(_buffered_values); | 363 | 8 | } | 364 | | | 365 | | // write the footer: | 366 | | // 1 _storage_formats and bit_widths | 367 | 14 | DCHECK(_storage_formats.size() == _bit_widths.size()) | 368 | 0 | << "Size of _storage_formats and _bit_widths should be equal."; | 369 | 32 | for (size_t i = 0; i < _storage_formats.size(); i++) { | 370 | 18 | _buffer->append(&_storage_formats[i], 1); | 371 | 18 | _buffer->append(&_bit_widths[i], 1); | 372 | 18 | } | 373 | | // 2 frame_value_num and values_num | 374 | 14 | uint8_t frame_value_num = FRAME_VALUE_NUM; | 375 | 14 | _buffer->append(&frame_value_num, 1); | 376 | 14 | put_fixed32_le(_buffer, _values_num); | 377 | | | 378 | 14 | return cast_set<uint32_t>(_buffer->size()); | 379 | 14 | } |
_ZN5doris10ForEncoderIlE5flushEv Line | Count | Source | 360 | 32.6k | uint32_t ForEncoder<T>::flush() { | 361 | 32.6k | if (_buffered_values_num != 0) { | 362 | 32.5k | bit_packing_one_frame_value(_buffered_values); | 363 | 32.5k | } | 364 | | | 365 | | // write the footer: | 366 | | // 1 _storage_formats and bit_widths | 367 | 32.6k | DCHECK(_storage_formats.size() == _bit_widths.size()) | 368 | 0 | << "Size of _storage_formats and _bit_widths should be equal."; | 369 | 81.5k | for (size_t i = 0; i < _storage_formats.size(); i++) { | 370 | 48.9k | _buffer->append(&_storage_formats[i], 1); | 371 | 48.9k | _buffer->append(&_bit_widths[i], 1); | 372 | 48.9k | } | 373 | | // 2 frame_value_num and values_num | 374 | 32.6k | uint8_t frame_value_num = FRAME_VALUE_NUM; | 375 | 32.6k | _buffer->append(&frame_value_num, 1); | 376 | 32.6k | put_fixed32_le(_buffer, _values_num); | 377 | | | 378 | 32.6k | return cast_set<uint32_t>(_buffer->size()); | 379 | 32.6k | } |
_ZN5doris10ForEncoderInE5flushEv Line | Count | Source | 360 | 32.6k | uint32_t ForEncoder<T>::flush() { | 361 | 32.6k | if (_buffered_values_num != 0) { | 362 | 32.5k | bit_packing_one_frame_value(_buffered_values); | 363 | 32.5k | } | 364 | | | 365 | | // write the footer: | 366 | | // 1 _storage_formats and bit_widths | 367 | 32.6k | DCHECK(_storage_formats.size() == _bit_widths.size()) | 368 | 0 | << "Size of _storage_formats and _bit_widths should be equal."; | 369 | 81.5k | for (size_t i = 0; i < _storage_formats.size(); i++) { | 370 | 48.8k | _buffer->append(&_storage_formats[i], 1); | 371 | 48.8k | _buffer->append(&_bit_widths[i], 1); | 372 | 48.8k | } | 373 | | // 2 frame_value_num and values_num | 374 | 32.6k | uint8_t frame_value_num = FRAME_VALUE_NUM; | 375 | 32.6k | _buffer->append(&frame_value_num, 1); | 376 | 32.6k | put_fixed32_le(_buffer, _values_num); | 377 | | | 378 | 32.6k | return cast_set<uint32_t>(_buffer->size()); | 379 | 32.6k | } |
Unexecuted instantiation: _ZN5doris10ForEncoderIhE5flushEv Unexecuted instantiation: _ZN5doris10ForEncoderItE5flushEv _ZN5doris10ForEncoderIjE5flushEv Line | Count | Source | 360 | 6 | uint32_t ForEncoder<T>::flush() { | 361 | 6 | if (_buffered_values_num != 0) { | 362 | 0 | bit_packing_one_frame_value(_buffered_values); | 363 | 0 | } | 364 | | | 365 | | // write the footer: | 366 | | // 1 _storage_formats and bit_widths | 367 | 6 | DCHECK(_storage_formats.size() == _bit_widths.size()) | 368 | 0 | << "Size of _storage_formats and _bit_widths should be equal."; | 369 | 18 | for (size_t i = 0; i < _storage_formats.size(); i++) { | 370 | 12 | _buffer->append(&_storage_formats[i], 1); | 371 | 12 | _buffer->append(&_bit_widths[i], 1); | 372 | 12 | } | 373 | | // 2 frame_value_num and values_num | 374 | 6 | uint8_t frame_value_num = FRAME_VALUE_NUM; | 375 | 6 | _buffer->append(&frame_value_num, 1); | 376 | 6 | put_fixed32_le(_buffer, _values_num); | 377 | | | 378 | 6 | return cast_set<uint32_t>(_buffer->size()); | 379 | 6 | } |
Unexecuted instantiation: _ZN5doris10ForEncoderImE5flushEv Unexecuted instantiation: _ZN5doris10ForEncoderINS_8uint24_tEE5flushEv Unexecuted instantiation: _ZN5doris10ForEncoderIoE5flushEv |
380 | | |
381 | | template <typename T> |
382 | 97.8k | const T ForEncoder<T>::numeric_limits_max() { |
383 | 97.8k | return std::numeric_limits<T>::max(); |
384 | 97.8k | } Unexecuted instantiation: _ZN5doris10ForEncoderIaE18numeric_limits_maxEv Unexecuted instantiation: _ZN5doris10ForEncoderIsE18numeric_limits_maxEv _ZN5doris10ForEncoderIiE18numeric_limits_maxEv Line | Count | Source | 382 | 18 | const T ForEncoder<T>::numeric_limits_max() { | 383 | 18 | return std::numeric_limits<T>::max(); | 384 | 18 | } |
_ZN5doris10ForEncoderIlE18numeric_limits_maxEv Line | Count | Source | 382 | 48.9k | const T ForEncoder<T>::numeric_limits_max() { | 383 | 48.9k | return std::numeric_limits<T>::max(); | 384 | 48.9k | } |
_ZN5doris10ForEncoderInE18numeric_limits_maxEv Line | Count | Source | 382 | 48.8k | const T ForEncoder<T>::numeric_limits_max() { | 383 | 48.8k | return std::numeric_limits<T>::max(); | 384 | 48.8k | } |
Unexecuted instantiation: _ZN5doris10ForEncoderIhE18numeric_limits_maxEv Unexecuted instantiation: _ZN5doris10ForEncoderItE18numeric_limits_maxEv _ZN5doris10ForEncoderIjE18numeric_limits_maxEv Line | Count | Source | 382 | 12 | const T ForEncoder<T>::numeric_limits_max() { | 383 | 12 | return std::numeric_limits<T>::max(); | 384 | 12 | } |
Unexecuted instantiation: _ZN5doris10ForEncoderImE18numeric_limits_maxEv Unexecuted instantiation: _ZN5doris10ForEncoderIoE18numeric_limits_maxEv |
385 | | |
386 | | template <> |
387 | 0 | const uint24_t ForEncoder<uint24_t>::numeric_limits_max() { |
388 | 0 | return 0XFFFFFF; |
389 | 0 | } |
390 | | |
391 | | template <typename T> |
392 | 65.3k | bool ForDecoder<T>::init() { |
393 | | // When row count is zero, the minimum footer size is 5: |
394 | | // only has ValuesNum(4) + FrameValueNum(1) |
395 | 65.3k | if (_buffer_len < 5) { |
396 | 0 | return false; |
397 | 0 | } |
398 | | |
399 | 65.3k | _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5); |
400 | 65.3k | _values_num = decode_fixed32_le(_buffer + _buffer_len - 4); |
401 | 65.3k | _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0); |
402 | 65.3k | _last_frame_size = |
403 | 65.3k | cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num)); |
404 | | |
405 | 65.3k | size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2; |
406 | | |
407 | | // read _storage_formats, bit_widths and compute frame_offsets |
408 | 65.3k | u_int32_t frame_start_offset = 0; |
409 | 163k | for (uint32_t i = 0; i < _frame_count; i++) { |
410 | 97.8k | uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset); |
411 | 97.8k | uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1); |
412 | 97.8k | _bit_widths.push_back(bit_width); |
413 | 97.8k | _storage_formats.push_back(order_flag); |
414 | | |
415 | 97.8k | bit_width_offset += 2; |
416 | | |
417 | 97.8k | _frame_offsets.push_back(frame_start_offset); |
418 | 97.8k | if (sizeof(T) == 16) { |
419 | 48.8k | frame_start_offset += bit_width * _max_frame_size / 8 + 16; |
420 | 48.9k | } else if (sizeof(T) == 8) { |
421 | 48.9k | frame_start_offset += bit_width * _max_frame_size / 8 + 8; |
422 | 48.9k | } else { |
423 | 30 | frame_start_offset += bit_width * _max_frame_size / 8 + 4; |
424 | 30 | } |
425 | 97.8k | } |
426 | | |
427 | 65.3k | _out_buffer.resize(_max_frame_size); |
428 | 65.3k | _parsed = true; |
429 | | |
430 | 65.3k | return true; |
431 | 65.3k | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE4initEv Unexecuted instantiation: _ZN5doris10ForDecoderIsE4initEv _ZN5doris10ForDecoderIiE4initEv Line | Count | Source | 392 | 14 | bool ForDecoder<T>::init() { | 393 | | // When row count is zero, the minimum footer size is 5: | 394 | | // only has ValuesNum(4) + FrameValueNum(1) | 395 | 14 | if (_buffer_len < 5) { | 396 | 0 | return false; | 397 | 0 | } | 398 | | | 399 | 14 | _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5); | 400 | 14 | _values_num = decode_fixed32_le(_buffer + _buffer_len - 4); | 401 | 14 | _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0); | 402 | 14 | _last_frame_size = | 403 | 14 | cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num)); | 404 | | | 405 | 14 | size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2; | 406 | | | 407 | | // read _storage_formats, bit_widths and compute frame_offsets | 408 | 14 | u_int32_t frame_start_offset = 0; | 409 | 32 | for (uint32_t i = 0; i < _frame_count; i++) { | 410 | 18 | uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset); | 411 | 18 | uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1); | 412 | 18 | _bit_widths.push_back(bit_width); | 413 | 18 | _storage_formats.push_back(order_flag); | 414 | | | 415 | 18 | bit_width_offset += 2; | 416 | | | 417 | 18 | _frame_offsets.push_back(frame_start_offset); | 418 | 18 | if (sizeof(T) == 16) { | 419 | 0 | frame_start_offset += bit_width * _max_frame_size / 8 + 16; | 420 | 18 | } else if (sizeof(T) == 8) { | 421 | 0 | frame_start_offset += bit_width * _max_frame_size / 8 + 8; | 422 | 18 | } else { | 423 | 18 | frame_start_offset += bit_width * _max_frame_size / 8 + 4; | 424 | 18 | } | 425 | 18 | } | 426 | | | 427 | 14 | _out_buffer.resize(_max_frame_size); | 428 | 14 | _parsed = true; | 429 | | | 430 | 14 | return true; | 431 | 14 | } |
_ZN5doris10ForDecoderIlE4initEv Line | Count | Source | 392 | 32.6k | bool ForDecoder<T>::init() { | 393 | | // When row count is zero, the minimum footer size is 5: | 394 | | // only has ValuesNum(4) + FrameValueNum(1) | 395 | 32.6k | if (_buffer_len < 5) { | 396 | 0 | return false; | 397 | 0 | } | 398 | | | 399 | 32.6k | _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5); | 400 | 32.6k | _values_num = decode_fixed32_le(_buffer + _buffer_len - 4); | 401 | 32.6k | _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0); | 402 | 32.6k | _last_frame_size = | 403 | 32.6k | cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num)); | 404 | | | 405 | 32.6k | size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2; | 406 | | | 407 | | // read _storage_formats, bit_widths and compute frame_offsets | 408 | 32.6k | u_int32_t frame_start_offset = 0; | 409 | 81.5k | for (uint32_t i = 0; i < _frame_count; i++) { | 410 | 48.9k | uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset); | 411 | 48.9k | uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1); | 412 | 48.9k | _bit_widths.push_back(bit_width); | 413 | 48.9k | _storage_formats.push_back(order_flag); | 414 | | | 415 | 48.9k | bit_width_offset += 2; | 416 | | | 417 | 48.9k | _frame_offsets.push_back(frame_start_offset); | 418 | 48.9k | if (sizeof(T) == 16) { | 419 | 0 | frame_start_offset += bit_width * _max_frame_size / 8 + 16; | 420 | 48.9k | } else if (sizeof(T) == 8) { | 421 | 48.9k | frame_start_offset += bit_width * _max_frame_size / 8 + 8; | 422 | 48.9k | } else { | 423 | 0 | frame_start_offset += bit_width * _max_frame_size / 8 + 4; | 424 | 0 | } | 425 | 48.9k | } | 426 | | | 427 | 32.6k | _out_buffer.resize(_max_frame_size); | 428 | 32.6k | _parsed = true; | 429 | | | 430 | 32.6k | return true; | 431 | 32.6k | } |
_ZN5doris10ForDecoderInE4initEv Line | Count | Source | 392 | 32.6k | bool ForDecoder<T>::init() { | 393 | | // When row count is zero, the minimum footer size is 5: | 394 | | // only has ValuesNum(4) + FrameValueNum(1) | 395 | 32.6k | if (_buffer_len < 5) { | 396 | 0 | return false; | 397 | 0 | } | 398 | | | 399 | 32.6k | _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5); | 400 | 32.6k | _values_num = decode_fixed32_le(_buffer + _buffer_len - 4); | 401 | 32.6k | _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0); | 402 | 32.6k | _last_frame_size = | 403 | 32.6k | cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num)); | 404 | | | 405 | 32.6k | size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2; | 406 | | | 407 | | // read _storage_formats, bit_widths and compute frame_offsets | 408 | 32.6k | u_int32_t frame_start_offset = 0; | 409 | 81.5k | for (uint32_t i = 0; i < _frame_count; i++) { | 410 | 48.8k | uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset); | 411 | 48.8k | uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1); | 412 | 48.8k | _bit_widths.push_back(bit_width); | 413 | 48.8k | _storage_formats.push_back(order_flag); | 414 | | | 415 | 48.8k | bit_width_offset += 2; | 416 | | | 417 | 48.8k | _frame_offsets.push_back(frame_start_offset); | 418 | 48.8k | if (sizeof(T) == 16) { | 419 | 48.8k | frame_start_offset += bit_width * _max_frame_size / 8 + 16; | 420 | 48.8k | } else if (sizeof(T) == 8) { | 421 | 0 | frame_start_offset += bit_width * _max_frame_size / 8 + 8; | 422 | 0 | } else { | 423 | 0 | frame_start_offset += bit_width * _max_frame_size / 8 + 4; | 424 | 0 | } | 425 | 48.8k | } | 426 | | | 427 | 32.6k | _out_buffer.resize(_max_frame_size); | 428 | 32.6k | _parsed = true; | 429 | | | 430 | 32.6k | return true; | 431 | 32.6k | } |
Unexecuted instantiation: _ZN5doris10ForDecoderIhE4initEv Unexecuted instantiation: _ZN5doris10ForDecoderItE4initEv _ZN5doris10ForDecoderIjE4initEv Line | Count | Source | 392 | 6 | bool ForDecoder<T>::init() { | 393 | | // When row count is zero, the minimum footer size is 5: | 394 | | // only has ValuesNum(4) + FrameValueNum(1) | 395 | 6 | if (_buffer_len < 5) { | 396 | 0 | return false; | 397 | 0 | } | 398 | | | 399 | 6 | _max_frame_size = decode_fixed8(_buffer + _buffer_len - 5); | 400 | 6 | _values_num = decode_fixed32_le(_buffer + _buffer_len - 4); | 401 | 6 | _frame_count = _values_num / _max_frame_size + (_values_num % _max_frame_size != 0); | 402 | 6 | _last_frame_size = | 403 | 6 | cast_set<uint8_t>(_max_frame_size - (_max_frame_size * _frame_count - _values_num)); | 404 | | | 405 | 6 | size_t bit_width_offset = _buffer_len - 5 - _frame_count * 2; | 406 | | | 407 | | // read _storage_formats, bit_widths and compute frame_offsets | 408 | 6 | u_int32_t frame_start_offset = 0; | 409 | 18 | for (uint32_t i = 0; i < _frame_count; i++) { | 410 | 12 | uint8_t order_flag = decode_fixed8(_buffer + bit_width_offset); | 411 | 12 | uint8_t bit_width = decode_fixed8(_buffer + bit_width_offset + 1); | 412 | 12 | _bit_widths.push_back(bit_width); | 413 | 12 | _storage_formats.push_back(order_flag); | 414 | | | 415 | 12 | bit_width_offset += 2; | 416 | | | 417 | 12 | _frame_offsets.push_back(frame_start_offset); | 418 | 12 | if (sizeof(T) == 16) { | 419 | 0 | frame_start_offset += bit_width * _max_frame_size / 8 + 16; | 420 | 12 | } else if (sizeof(T) == 8) { | 421 | 0 | frame_start_offset += bit_width * _max_frame_size / 8 + 8; | 422 | 12 | } else { | 423 | 12 | frame_start_offset += bit_width * _max_frame_size / 8 + 4; | 424 | 12 | } | 425 | 12 | } | 426 | | | 427 | 6 | _out_buffer.resize(_max_frame_size); | 428 | 6 | _parsed = true; | 429 | | | 430 | 6 | return true; | 431 | 6 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderImE4initEv Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE4initEv Unexecuted instantiation: _ZN5doris10ForDecoderIoE4initEv |
432 | | |
433 | | // todo(kks): improve this method by SIMD instructions |
434 | | |
435 | | template <typename T> |
436 | | template <typename U> |
437 | | void ForDecoder<T>::bit_unpack_optimize(const uint8_t* input, uint8_t in_num, int bit_width, |
438 | 162k | T* output) { |
439 | 162k | static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value, |
440 | 162k | "bit_unpack_optimize only supports U = int64_t or __int128_t"); |
441 | 162k | constexpr int u_size = sizeof(U); // Size of U |
442 | 162k | constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size) |
443 | 162k | int valid_bit = 0; // How many valid bits |
444 | 162k | int need_bit = 0; // still need |
445 | 162k | size_t input_size = (in_num * bit_width + 7) >> 3; // input's size |
446 | 162k | int full_batch_size = |
447 | 162k | cast_set<int>((input_size >> u_size_shift) |
448 | 162k | << u_size_shift); // Adjust input_size to a multiple of u_size |
449 | 162k | int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size. |
450 | | // The number of bits in input to adjust to multiples of 8 and thus more |
451 | 162k | int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width)); |
452 | | |
453 | | // to ensure that only bit_width bits are valid |
454 | 162k | T output_mask; |
455 | 162k | if (bit_width >= static_cast<int>(sizeof(T) * 8)) { |
456 | 0 | output_mask = static_cast<T>(~T(0)); |
457 | 162k | } else { |
458 | 162k | output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1); |
459 | 162k | } |
460 | | |
461 | 162k | U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking |
462 | | |
463 | 8.96M | for (int i = 0; i < full_batch_size; i += u_size) { |
464 | 8.80M | s = 0; |
465 | | |
466 | 8.80M | s = to_endian<std::endian::big>(*((U*)(input + i))); |
467 | | |
468 | | // Determine what the valid bits are based on u_size |
469 | 8.80M | valid_bit = u_size << 3; |
470 | | |
471 | | // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop. |
472 | 8.80M | if (tail_count == 0 && i == full_batch_size - u_size) { |
473 | 43.5k | valid_bit -= more_bit; |
474 | 43.5k | s >>= more_bit; |
475 | 43.5k | } |
476 | | |
477 | 8.80M | if (need_bit) { |
478 | | // The last time we take away the high bit_width - need_bit, |
479 | | // we need to make up the rest of the need_bit from the width. |
480 | | // Use valid_bit - need_bit to compute high need_bit bits of s |
481 | | // perform an AND operation to ensure that only need_bit bits are valid |
482 | 8.18M | auto mask = (static_cast<U>(1) << need_bit) - 1; |
483 | 8.18M | auto shifted = s >> (valid_bit - need_bit); |
484 | 8.18M | auto masked_result = shifted & mask; |
485 | 8.18M | if constexpr (sizeof(T) <= 4) { |
486 | 0 | *output |= static_cast<T>(static_cast<uint32_t>(masked_result)); |
487 | 8.18M | } else { |
488 | 8.18M | *output |= static_cast<T>(masked_result); |
489 | 8.18M | } |
490 | 8.18M | output++; |
491 | 8.18M | valid_bit -= need_bit; |
492 | 8.18M | } |
493 | | |
494 | 8.80M | int num = valid_bit / bit_width; // How many outputs can be processed at a time |
495 | 8.80M | int remainder = valid_bit - num * bit_width; // How many bits are left to store |
496 | | |
497 | | // Starting with the highest valid bit, take out bit_width bits in sequence |
498 | | // perform an AND operation with output_mask to ensure that only bit_width bits are valid |
499 | | // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end |
500 | | // But since there are still remainder bits that can't be processed, need to add the remainder |
501 | 17.0M | for (int j = 0; j < num; j++) { |
502 | 8.22M | *output = |
503 | 8.22M | static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask); |
504 | 8.22M | output++; |
505 | 8.22M | } |
506 | | |
507 | 8.80M | if (remainder) { |
508 | | // Process the last remaining remainder bit. |
509 | | // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits. |
510 | | // output = y << (bit_width - remainder) Use the high bit_width - remainder bit |
511 | 8.28M | if constexpr (sizeof(T) <= 4) { |
512 | 0 | auto masked_value = static_cast<T>( |
513 | 0 | static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1))); |
514 | 0 | *output = static_cast<T>(masked_value << (bit_width - remainder)); |
515 | 8.28M | } else { |
516 | 8.28M | auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1))); |
517 | 8.28M | *output = static_cast<T>(masked_value << (bit_width - remainder)); |
518 | 8.28M | } |
519 | | // Already have remainder bits, next time need bit_width - remainder bits |
520 | 8.28M | need_bit = bit_width - remainder; |
521 | 8.28M | } else { |
522 | 515k | need_bit = 0; |
523 | 515k | } |
524 | 8.80M | } |
525 | | |
526 | | // remainder |
527 | 162k | if (tail_count) { |
528 | | // Put the tail_count numbers in the input into s in order, each number occupies 8 bit |
529 | 954k | for (int i = 0; i < tail_count; i++) { |
530 | 835k | s <<= 8; |
531 | 835k | s |= input[full_batch_size + i]; |
532 | 835k | } |
533 | | |
534 | | // tail * 8 is the number of bits that are left to process |
535 | | // tail * 8 - more_bit is to remove the last more_bit |
536 | 118k | valid_bit = (tail_count << 3) - more_bit; |
537 | 118k | s >>= more_bit; |
538 | | |
539 | | // same as before |
540 | 118k | if (need_bit) { |
541 | 108k | if constexpr (sizeof(T) <= 4) { |
542 | 0 | *output |= static_cast<T>(static_cast<uint32_t>( |
543 | 0 | (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1))); |
544 | 108k | } else { |
545 | 108k | *output |= static_cast<T>((s >> (valid_bit - need_bit)) & |
546 | 108k | ((static_cast<U>(1) << need_bit) - 1)); |
547 | 108k | } |
548 | 108k | output++; |
549 | 108k | valid_bit -= need_bit; |
550 | 108k | } |
551 | | |
552 | 118k | int num = valid_bit / bit_width; // How many outputs can be processed at a time |
553 | | |
554 | | // same as before |
555 | 253k | for (int j = 0; j < num; j++) { |
556 | 134k | *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask); |
557 | 134k | output++; |
558 | 134k | } |
559 | 118k | } |
560 | 162k | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE19bit_unpack_optimizeIlEEvPKhhiPa Unexecuted instantiation: _ZN5doris10ForDecoderIaE19bit_unpack_optimizeInEEvPKhhiPa Unexecuted instantiation: _ZN5doris10ForDecoderIsE19bit_unpack_optimizeIlEEvPKhhiPs Unexecuted instantiation: _ZN5doris10ForDecoderIsE19bit_unpack_optimizeInEEvPKhhiPs _ZN5doris10ForDecoderIiE19bit_unpack_optimizeIlEEvPKhhiPi Line | Count | Source | 438 | 18 | T* output) { | 439 | 18 | static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value, | 440 | 18 | "bit_unpack_optimize only supports U = int64_t or __int128_t"); | 441 | 18 | constexpr int u_size = sizeof(U); // Size of U | 442 | 18 | constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size) | 443 | 18 | int valid_bit = 0; // How many valid bits | 444 | 18 | int need_bit = 0; // still need | 445 | 18 | size_t input_size = (in_num * bit_width + 7) >> 3; // input's size | 446 | 18 | int full_batch_size = | 447 | 18 | cast_set<int>((input_size >> u_size_shift) | 448 | 18 | << u_size_shift); // Adjust input_size to a multiple of u_size | 449 | 18 | int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size. | 450 | | // The number of bits in input to adjust to multiples of 8 and thus more | 451 | 18 | int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width)); | 452 | | | 453 | | // to ensure that only bit_width bits are valid | 454 | 18 | T output_mask; | 455 | 18 | if (bit_width >= static_cast<int>(sizeof(T) * 8)) { | 456 | 0 | output_mask = static_cast<T>(~T(0)); | 457 | 18 | } else { | 458 | 18 | output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1); | 459 | 18 | } | 460 | | | 461 | 18 | U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking | 462 | | | 463 | 42 | for (int i = 0; i < full_batch_size; i += u_size) { | 464 | 24 | s = 0; | 465 | | | 466 | 24 | s = to_endian<std::endian::big>(*((U*)(input + i))); | 467 | | | 468 | | // Determine what the valid bits are based on u_size | 469 | 24 | valid_bit = u_size << 3; | 470 | | | 471 | | // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop. | 472 | 24 | if (tail_count == 0 && i == full_batch_size - u_size) { | 473 | 14 | valid_bit -= more_bit; | 474 | 14 | s >>= more_bit; | 475 | 14 | } | 476 | | | 477 | 24 | if (need_bit) { | 478 | | // The last time we take away the high bit_width - need_bit, | 479 | | // we need to make up the rest of the need_bit from the width. | 480 | | // Use valid_bit - need_bit to compute high need_bit bits of s | 481 | | // perform an AND operation to ensure that only need_bit bits are valid | 482 | 0 | auto mask = (static_cast<U>(1) << need_bit) - 1; | 483 | 0 | auto shifted = s >> (valid_bit - need_bit); | 484 | 0 | auto masked_result = shifted & mask; | 485 | 0 | if constexpr (sizeof(T) <= 4) { | 486 | 0 | *output |= static_cast<T>(static_cast<uint32_t>(masked_result)); | 487 | | } else { | 488 | | *output |= static_cast<T>(masked_result); | 489 | | } | 490 | 0 | output++; | 491 | 0 | valid_bit -= need_bit; | 492 | 0 | } | 493 | | | 494 | 24 | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 495 | 24 | int remainder = valid_bit - num * bit_width; // How many bits are left to store | 496 | | | 497 | | // Starting with the highest valid bit, take out bit_width bits in sequence | 498 | | // perform an AND operation with output_mask to ensure that only bit_width bits are valid | 499 | | // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end | 500 | | // But since there are still remainder bits that can't be processed, need to add the remainder | 501 | 1.56k | for (int j = 0; j < num; j++) { | 502 | 1.53k | *output = | 503 | 1.53k | static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask); | 504 | 1.53k | output++; | 505 | 1.53k | } | 506 | | | 507 | 24 | if (remainder) { | 508 | | // Process the last remaining remainder bit. | 509 | | // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits. | 510 | | // output = y << (bit_width - remainder) Use the high bit_width - remainder bit | 511 | 0 | if constexpr (sizeof(T) <= 4) { | 512 | 0 | auto masked_value = static_cast<T>( | 513 | 0 | static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1))); | 514 | 0 | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 515 | | } else { | 516 | | auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1))); | 517 | | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 518 | | } | 519 | | // Already have remainder bits, next time need bit_width - remainder bits | 520 | 0 | need_bit = bit_width - remainder; | 521 | 24 | } else { | 522 | 24 | need_bit = 0; | 523 | 24 | } | 524 | 24 | } | 525 | | | 526 | | // remainder | 527 | 18 | if (tail_count) { | 528 | | // Put the tail_count numbers in the input into s in order, each number occupies 8 bit | 529 | 4 | for (int i = 0; i < tail_count; i++) { | 530 | 2 | s <<= 8; | 531 | 2 | s |= input[full_batch_size + i]; | 532 | 2 | } | 533 | | | 534 | | // tail * 8 is the number of bits that are left to process | 535 | | // tail * 8 - more_bit is to remove the last more_bit | 536 | 2 | valid_bit = (tail_count << 3) - more_bit; | 537 | 2 | s >>= more_bit; | 538 | | | 539 | | // same as before | 540 | 2 | if (need_bit) { | 541 | 0 | if constexpr (sizeof(T) <= 4) { | 542 | 0 | *output |= static_cast<T>(static_cast<uint32_t>( | 543 | 0 | (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1))); | 544 | | } else { | 545 | | *output |= static_cast<T>((s >> (valid_bit - need_bit)) & | 546 | | ((static_cast<U>(1) << need_bit) - 1)); | 547 | | } | 548 | 0 | output++; | 549 | 0 | valid_bit -= need_bit; | 550 | 0 | } | 551 | | | 552 | 2 | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 553 | | | 554 | | // same as before | 555 | 6 | for (int j = 0; j < num; j++) { | 556 | 4 | *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask); | 557 | 4 | output++; | 558 | 4 | } | 559 | 2 | } | 560 | 18 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderIiE19bit_unpack_optimizeInEEvPKhhiPi _ZN5doris10ForDecoderIlE19bit_unpack_optimizeIlEEvPKhhiPl Line | Count | Source | 438 | 24.6k | T* output) { | 439 | 24.6k | static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value, | 440 | 24.6k | "bit_unpack_optimize only supports U = int64_t or __int128_t"); | 441 | 24.6k | constexpr int u_size = sizeof(U); // Size of U | 442 | 24.6k | constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size) | 443 | 24.6k | int valid_bit = 0; // How many valid bits | 444 | 24.6k | int need_bit = 0; // still need | 445 | 24.6k | size_t input_size = (in_num * bit_width + 7) >> 3; // input's size | 446 | 24.6k | int full_batch_size = | 447 | 24.6k | cast_set<int>((input_size >> u_size_shift) | 448 | 24.6k | << u_size_shift); // Adjust input_size to a multiple of u_size | 449 | 24.6k | int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size. | 450 | | // The number of bits in input to adjust to multiples of 8 and thus more | 451 | 24.6k | int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width)); | 452 | | | 453 | | // to ensure that only bit_width bits are valid | 454 | 24.6k | T output_mask; | 455 | 24.6k | if (bit_width >= static_cast<int>(sizeof(T) * 8)) { | 456 | 0 | output_mask = static_cast<T>(~T(0)); | 457 | 24.6k | } else { | 458 | 24.6k | output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1); | 459 | 24.6k | } | 460 | | | 461 | 24.6k | U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking | 462 | | | 463 | 556k | for (int i = 0; i < full_batch_size; i += u_size) { | 464 | 532k | s = 0; | 465 | | | 466 | 532k | s = to_endian<std::endian::big>(*((U*)(input + i))); | 467 | | | 468 | | // Determine what the valid bits are based on u_size | 469 | 532k | valid_bit = u_size << 3; | 470 | | | 471 | | // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop. | 472 | 532k | if (tail_count == 0 && i == full_batch_size - u_size) { | 473 | 10.3k | valid_bit -= more_bit; | 474 | 10.3k | s >>= more_bit; | 475 | 10.3k | } | 476 | | | 477 | 532k | if (need_bit) { | 478 | | // The last time we take away the high bit_width - need_bit, | 479 | | // we need to make up the rest of the need_bit from the width. | 480 | | // Use valid_bit - need_bit to compute high need_bit bits of s | 481 | | // perform an AND operation to ensure that only need_bit bits are valid | 482 | 414k | auto mask = (static_cast<U>(1) << need_bit) - 1; | 483 | 414k | auto shifted = s >> (valid_bit - need_bit); | 484 | 414k | auto masked_result = shifted & mask; | 485 | | if constexpr (sizeof(T) <= 4) { | 486 | | *output |= static_cast<T>(static_cast<uint32_t>(masked_result)); | 487 | 414k | } else { | 488 | 414k | *output |= static_cast<T>(masked_result); | 489 | 414k | } | 490 | 414k | output++; | 491 | 414k | valid_bit -= need_bit; | 492 | 414k | } | 493 | | | 494 | 532k | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 495 | 532k | int remainder = valid_bit - num * bit_width; // How many bits are left to store | 496 | | | 497 | | // Starting with the highest valid bit, take out bit_width bits in sequence | 498 | | // perform an AND operation with output_mask to ensure that only bit_width bits are valid | 499 | | // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end | 500 | | // But since there are still remainder bits that can't be processed, need to add the remainder | 501 | 2.15M | for (int j = 0; j < num; j++) { | 502 | 1.61M | *output = | 503 | 1.61M | static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask); | 504 | 1.61M | output++; | 505 | 1.61M | } | 506 | | | 507 | 532k | if (remainder) { | 508 | | // Process the last remaining remainder bit. | 509 | | // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits. | 510 | | // output = y << (bit_width - remainder) Use the high bit_width - remainder bit | 511 | | if constexpr (sizeof(T) <= 4) { | 512 | | auto masked_value = static_cast<T>( | 513 | | static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1))); | 514 | | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 515 | 424k | } else { | 516 | 424k | auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1))); | 517 | 424k | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 518 | 424k | } | 519 | | // Already have remainder bits, next time need bit_width - remainder bits | 520 | 424k | need_bit = bit_width - remainder; | 521 | 424k | } else { | 522 | 107k | need_bit = 0; | 523 | 107k | } | 524 | 532k | } | 525 | | | 526 | | // remainder | 527 | 24.6k | if (tail_count) { | 528 | | // Put the tail_count numbers in the input into s in order, each number occupies 8 bit | 529 | 70.0k | for (int i = 0; i < tail_count; i++) { | 530 | 56.0k | s <<= 8; | 531 | 56.0k | s |= input[full_batch_size + i]; | 532 | 56.0k | } | 533 | | | 534 | | // tail * 8 is the number of bits that are left to process | 535 | | // tail * 8 - more_bit is to remove the last more_bit | 536 | 13.9k | valid_bit = (tail_count << 3) - more_bit; | 537 | 13.9k | s >>= more_bit; | 538 | | | 539 | | // same as before | 540 | 13.9k | if (need_bit) { | 541 | | if constexpr (sizeof(T) <= 4) { | 542 | | *output |= static_cast<T>(static_cast<uint32_t>( | 543 | | (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1))); | 544 | 10.2k | } else { | 545 | 10.2k | *output |= static_cast<T>((s >> (valid_bit - need_bit)) & | 546 | 10.2k | ((static_cast<U>(1) << need_bit) - 1)); | 547 | 10.2k | } | 548 | 10.2k | output++; | 549 | 10.2k | valid_bit -= need_bit; | 550 | 10.2k | } | 551 | | | 552 | 13.9k | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 553 | | | 554 | | // same as before | 555 | 61.3k | for (int j = 0; j < num; j++) { | 556 | 47.3k | *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask); | 557 | 47.3k | output++; | 558 | 47.3k | } | 559 | 13.9k | } | 560 | 24.6k | } |
_ZN5doris10ForDecoderIlE19bit_unpack_optimizeInEEvPKhhiPl Line | Count | Source | 438 | 24.3k | T* output) { | 439 | 24.3k | static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value, | 440 | 24.3k | "bit_unpack_optimize only supports U = int64_t or __int128_t"); | 441 | 24.3k | constexpr int u_size = sizeof(U); // Size of U | 442 | 24.3k | constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size) | 443 | 24.3k | int valid_bit = 0; // How many valid bits | 444 | 24.3k | int need_bit = 0; // still need | 445 | 24.3k | size_t input_size = (in_num * bit_width + 7) >> 3; // input's size | 446 | 24.3k | int full_batch_size = | 447 | 24.3k | cast_set<int>((input_size >> u_size_shift) | 448 | 24.3k | << u_size_shift); // Adjust input_size to a multiple of u_size | 449 | 24.3k | int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size. | 450 | | // The number of bits in input to adjust to multiples of 8 and thus more | 451 | 24.3k | int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width)); | 452 | | | 453 | | // to ensure that only bit_width bits are valid | 454 | 24.3k | T output_mask; | 455 | 24.3k | if (bit_width >= static_cast<int>(sizeof(T) * 8)) { | 456 | 0 | output_mask = static_cast<T>(~T(0)); | 457 | 24.3k | } else { | 458 | 24.3k | output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1); | 459 | 24.3k | } | 460 | | | 461 | 24.3k | U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking | 462 | | | 463 | 807k | for (int i = 0; i < full_batch_size; i += u_size) { | 464 | 783k | s = 0; | 465 | | | 466 | 783k | s = to_endian<std::endian::big>(*((U*)(input + i))); | 467 | | | 468 | | // Determine what the valid bits are based on u_size | 469 | 783k | valid_bit = u_size << 3; | 470 | | | 471 | | // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop. | 472 | 783k | if (tail_count == 0 && i == full_batch_size - u_size) { | 473 | 9.10k | valid_bit -= more_bit; | 474 | 9.10k | s >>= more_bit; | 475 | 9.10k | } | 476 | | | 477 | 783k | if (need_bit) { | 478 | | // The last time we take away the high bit_width - need_bit, | 479 | | // we need to make up the rest of the need_bit from the width. | 480 | | // Use valid_bit - need_bit to compute high need_bit bits of s | 481 | | // perform an AND operation to ensure that only need_bit bits are valid | 482 | 735k | auto mask = (static_cast<U>(1) << need_bit) - 1; | 483 | 735k | auto shifted = s >> (valid_bit - need_bit); | 484 | 735k | auto masked_result = shifted & mask; | 485 | | if constexpr (sizeof(T) <= 4) { | 486 | | *output |= static_cast<T>(static_cast<uint32_t>(masked_result)); | 487 | 735k | } else { | 488 | 735k | *output |= static_cast<T>(masked_result); | 489 | 735k | } | 490 | 735k | output++; | 491 | 735k | valid_bit -= need_bit; | 492 | 735k | } | 493 | | | 494 | 783k | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 495 | 783k | int remainder = valid_bit - num * bit_width; // How many bits are left to store | 496 | | | 497 | | // Starting with the highest valid bit, take out bit_width bits in sequence | 498 | | // perform an AND operation with output_mask to ensure that only bit_width bits are valid | 499 | | // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end | 500 | | // But since there are still remainder bits that can't be processed, need to add the remainder | 501 | 2.10M | for (int j = 0; j < num; j++) { | 502 | 1.32M | *output = | 503 | 1.32M | static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask); | 504 | 1.32M | output++; | 505 | 1.32M | } | 506 | | | 507 | 783k | if (remainder) { | 508 | | // Process the last remaining remainder bit. | 509 | | // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits. | 510 | | // output = y << (bit_width - remainder) Use the high bit_width - remainder bit | 511 | | if constexpr (sizeof(T) <= 4) { | 512 | | auto masked_value = static_cast<T>( | 513 | | static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1))); | 514 | | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 515 | 749k | } else { | 516 | 749k | auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1))); | 517 | 749k | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 518 | 749k | } | 519 | | // Already have remainder bits, next time need bit_width - remainder bits | 520 | 749k | need_bit = bit_width - remainder; | 521 | 749k | } else { | 522 | 33.9k | need_bit = 0; | 523 | 33.9k | } | 524 | 783k | } | 525 | | | 526 | | // remainder | 527 | 24.3k | if (tail_count) { | 528 | | // Put the tail_count numbers in the input into s in order, each number occupies 8 bit | 529 | 137k | for (int i = 0; i < tail_count; i++) { | 530 | 121k | s <<= 8; | 531 | 121k | s |= input[full_batch_size + i]; | 532 | 121k | } | 533 | | | 534 | | // tail * 8 is the number of bits that are left to process | 535 | | // tail * 8 - more_bit is to remove the last more_bit | 536 | 15.2k | valid_bit = (tail_count << 3) - more_bit; | 537 | 15.2k | s >>= more_bit; | 538 | | | 539 | | // same as before | 540 | 15.2k | if (need_bit) { | 541 | | if constexpr (sizeof(T) <= 4) { | 542 | | *output |= static_cast<T>(static_cast<uint32_t>( | 543 | | (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1))); | 544 | 14.6k | } else { | 545 | 14.6k | *output |= static_cast<T>((s >> (valid_bit - need_bit)) & | 546 | 14.6k | ((static_cast<U>(1) << need_bit) - 1)); | 547 | 14.6k | } | 548 | 14.6k | output++; | 549 | 14.6k | valid_bit -= need_bit; | 550 | 14.6k | } | 551 | | | 552 | 15.2k | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 553 | | | 554 | | // same as before | 555 | 28.2k | for (int j = 0; j < num; j++) { | 556 | 13.0k | *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask); | 557 | 13.0k | output++; | 558 | 13.0k | } | 559 | 15.2k | } | 560 | 24.3k | } |
_ZN5doris10ForDecoderInE19bit_unpack_optimizeIlEEvPKhhiPn Line | Count | Source | 438 | 16.5k | T* output) { | 439 | 16.5k | static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value, | 440 | 16.5k | "bit_unpack_optimize only supports U = int64_t or __int128_t"); | 441 | 16.5k | constexpr int u_size = sizeof(U); // Size of U | 442 | 16.5k | constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size) | 443 | 16.5k | int valid_bit = 0; // How many valid bits | 444 | 16.5k | int need_bit = 0; // still need | 445 | 16.5k | size_t input_size = (in_num * bit_width + 7) >> 3; // input's size | 446 | 16.5k | int full_batch_size = | 447 | 16.5k | cast_set<int>((input_size >> u_size_shift) | 448 | 16.5k | << u_size_shift); // Adjust input_size to a multiple of u_size | 449 | 16.5k | int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size. | 450 | | // The number of bits in input to adjust to multiples of 8 and thus more | 451 | 16.5k | int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width)); | 452 | | | 453 | | // to ensure that only bit_width bits are valid | 454 | 16.5k | T output_mask; | 455 | 16.5k | if (bit_width >= static_cast<int>(sizeof(T) * 8)) { | 456 | 0 | output_mask = static_cast<T>(~T(0)); | 457 | 16.5k | } else { | 458 | 16.5k | output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1); | 459 | 16.5k | } | 460 | | | 461 | 16.5k | U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking | 462 | | | 463 | 548k | for (int i = 0; i < full_batch_size; i += u_size) { | 464 | 532k | s = 0; | 465 | | | 466 | 532k | s = to_endian<std::endian::big>(*((U*)(input + i))); | 467 | | | 468 | | // Determine what the valid bits are based on u_size | 469 | 532k | valid_bit = u_size << 3; | 470 | | | 471 | | // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop. | 472 | 532k | if (tail_count == 0 && i == full_batch_size - u_size) { | 473 | 2.24k | valid_bit -= more_bit; | 474 | 2.24k | s >>= more_bit; | 475 | 2.24k | } | 476 | | | 477 | 532k | if (need_bit) { | 478 | | // The last time we take away the high bit_width - need_bit, | 479 | | // we need to make up the rest of the need_bit from the width. | 480 | | // Use valid_bit - need_bit to compute high need_bit bits of s | 481 | | // perform an AND operation to ensure that only need_bit bits are valid | 482 | 414k | auto mask = (static_cast<U>(1) << need_bit) - 1; | 483 | 414k | auto shifted = s >> (valid_bit - need_bit); | 484 | 414k | auto masked_result = shifted & mask; | 485 | | if constexpr (sizeof(T) <= 4) { | 486 | | *output |= static_cast<T>(static_cast<uint32_t>(masked_result)); | 487 | 414k | } else { | 488 | 414k | *output |= static_cast<T>(masked_result); | 489 | 414k | } | 490 | 414k | output++; | 491 | 414k | valid_bit -= need_bit; | 492 | 414k | } | 493 | | | 494 | 532k | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 495 | 532k | int remainder = valid_bit - num * bit_width; // How many bits are left to store | 496 | | | 497 | | // Starting with the highest valid bit, take out bit_width bits in sequence | 498 | | // perform an AND operation with output_mask to ensure that only bit_width bits are valid | 499 | | // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end | 500 | | // But since there are still remainder bits that can't be processed, need to add the remainder | 501 | 2.14M | for (int j = 0; j < num; j++) { | 502 | 1.61M | *output = | 503 | 1.61M | static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask); | 504 | 1.61M | output++; | 505 | 1.61M | } | 506 | | | 507 | 532k | if (remainder) { | 508 | | // Process the last remaining remainder bit. | 509 | | // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits. | 510 | | // output = y << (bit_width - remainder) Use the high bit_width - remainder bit | 511 | | if constexpr (sizeof(T) <= 4) { | 512 | | auto masked_value = static_cast<T>( | 513 | | static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1))); | 514 | | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 515 | 424k | } else { | 516 | 424k | auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1))); | 517 | 424k | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 518 | 424k | } | 519 | | // Already have remainder bits, next time need bit_width - remainder bits | 520 | 424k | need_bit = bit_width - remainder; | 521 | 424k | } else { | 522 | 107k | need_bit = 0; | 523 | 107k | } | 524 | 532k | } | 525 | | | 526 | | // remainder | 527 | 16.5k | if (tail_count) { | 528 | | // Put the tail_count numbers in the input into s in order, each number occupies 8 bit | 529 | 70.4k | for (int i = 0; i < tail_count; i++) { | 530 | 56.3k | s <<= 8; | 531 | 56.3k | s |= input[full_batch_size + i]; | 532 | 56.3k | } | 533 | | | 534 | | // tail * 8 is the number of bits that are left to process | 535 | | // tail * 8 - more_bit is to remove the last more_bit | 536 | 14.0k | valid_bit = (tail_count << 3) - more_bit; | 537 | 14.0k | s >>= more_bit; | 538 | | | 539 | | // same as before | 540 | 14.0k | if (need_bit) { | 541 | | if constexpr (sizeof(T) <= 4) { | 542 | | *output |= static_cast<T>(static_cast<uint32_t>( | 543 | | (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1))); | 544 | 10.2k | } else { | 545 | 10.2k | *output |= static_cast<T>((s >> (valid_bit - need_bit)) & | 546 | 10.2k | ((static_cast<U>(1) << need_bit) - 1)); | 547 | 10.2k | } | 548 | 10.2k | output++; | 549 | 10.2k | valid_bit -= need_bit; | 550 | 10.2k | } | 551 | | | 552 | 14.0k | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 553 | | | 554 | | // same as before | 555 | 61.5k | for (int j = 0; j < num; j++) { | 556 | 47.4k | *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask); | 557 | 47.4k | output++; | 558 | 47.4k | } | 559 | 14.0k | } | 560 | 16.5k | } |
_ZN5doris10ForDecoderInE19bit_unpack_optimizeInEEvPKhhiPn Line | Count | Source | 438 | 97.0k | T* output) { | 439 | 97.0k | static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value, | 440 | 97.0k | "bit_unpack_optimize only supports U = int64_t or __int128_t"); | 441 | 97.0k | constexpr int u_size = sizeof(U); // Size of U | 442 | 97.0k | constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size) | 443 | 97.0k | int valid_bit = 0; // How many valid bits | 444 | 97.0k | int need_bit = 0; // still need | 445 | 97.0k | size_t input_size = (in_num * bit_width + 7) >> 3; // input's size | 446 | 97.0k | int full_batch_size = | 447 | 97.0k | cast_set<int>((input_size >> u_size_shift) | 448 | 97.0k | << u_size_shift); // Adjust input_size to a multiple of u_size | 449 | 97.0k | int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size. | 450 | | // The number of bits in input to adjust to multiples of 8 and thus more | 451 | 97.0k | int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width)); | 452 | | | 453 | | // to ensure that only bit_width bits are valid | 454 | 97.0k | T output_mask; | 455 | 97.0k | if (bit_width >= static_cast<int>(sizeof(T) * 8)) { | 456 | 0 | output_mask = static_cast<T>(~T(0)); | 457 | 97.0k | } else { | 458 | 97.0k | output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1); | 459 | 97.0k | } | 460 | | | 461 | 97.0k | U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking | 462 | | | 463 | 7.05M | for (int i = 0; i < full_batch_size; i += u_size) { | 464 | 6.95M | s = 0; | 465 | | | 466 | 6.95M | s = to_endian<std::endian::big>(*((U*)(input + i))); | 467 | | | 468 | | // Determine what the valid bits are based on u_size | 469 | 6.95M | valid_bit = u_size << 3; | 470 | | | 471 | | // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop. | 472 | 6.95M | if (tail_count == 0 && i == full_batch_size - u_size) { | 473 | 21.7k | valid_bit -= more_bit; | 474 | 21.7k | s >>= more_bit; | 475 | 21.7k | } | 476 | | | 477 | 6.95M | if (need_bit) { | 478 | | // The last time we take away the high bit_width - need_bit, | 479 | | // we need to make up the rest of the need_bit from the width. | 480 | | // Use valid_bit - need_bit to compute high need_bit bits of s | 481 | | // perform an AND operation to ensure that only need_bit bits are valid | 482 | 6.61M | auto mask = (static_cast<U>(1) << need_bit) - 1; | 483 | 6.61M | auto shifted = s >> (valid_bit - need_bit); | 484 | 6.61M | auto masked_result = shifted & mask; | 485 | | if constexpr (sizeof(T) <= 4) { | 486 | | *output |= static_cast<T>(static_cast<uint32_t>(masked_result)); | 487 | 6.61M | } else { | 488 | 6.61M | *output |= static_cast<T>(masked_result); | 489 | 6.61M | } | 490 | 6.61M | output++; | 491 | 6.61M | valid_bit -= need_bit; | 492 | 6.61M | } | 493 | | | 494 | 6.95M | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 495 | 6.95M | int remainder = valid_bit - num * bit_width; // How many bits are left to store | 496 | | | 497 | | // Starting with the highest valid bit, take out bit_width bits in sequence | 498 | | // perform an AND operation with output_mask to ensure that only bit_width bits are valid | 499 | | // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end | 500 | | // But since there are still remainder bits that can't be processed, need to add the remainder | 501 | 10.6M | for (int j = 0; j < num; j++) { | 502 | 3.66M | *output = | 503 | 3.66M | static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask); | 504 | 3.66M | output++; | 505 | 3.66M | } | 506 | | | 507 | 6.95M | if (remainder) { | 508 | | // Process the last remaining remainder bit. | 509 | | // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits. | 510 | | // output = y << (bit_width - remainder) Use the high bit_width - remainder bit | 511 | | if constexpr (sizeof(T) <= 4) { | 512 | | auto masked_value = static_cast<T>( | 513 | | static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1))); | 514 | | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 515 | 6.69M | } else { | 516 | 6.69M | auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1))); | 517 | 6.69M | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 518 | 6.69M | } | 519 | | // Already have remainder bits, next time need bit_width - remainder bits | 520 | 6.69M | need_bit = bit_width - remainder; | 521 | 6.69M | } else { | 522 | 265k | need_bit = 0; | 523 | 265k | } | 524 | 6.95M | } | 525 | | | 526 | | // remainder | 527 | 97.0k | if (tail_count) { | 528 | | // Put the tail_count numbers in the input into s in order, each number occupies 8 bit | 529 | 676k | for (int i = 0; i < tail_count; i++) { | 530 | 601k | s <<= 8; | 531 | 601k | s |= input[full_batch_size + i]; | 532 | 601k | } | 533 | | | 534 | | // tail * 8 is the number of bits that are left to process | 535 | | // tail * 8 - more_bit is to remove the last more_bit | 536 | 75.2k | valid_bit = (tail_count << 3) - more_bit; | 537 | 75.2k | s >>= more_bit; | 538 | | | 539 | | // same as before | 540 | 75.2k | if (need_bit) { | 541 | | if constexpr (sizeof(T) <= 4) { | 542 | | *output |= static_cast<T>(static_cast<uint32_t>( | 543 | | (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1))); | 544 | 72.8k | } else { | 545 | 72.8k | *output |= static_cast<T>((s >> (valid_bit - need_bit)) & | 546 | 72.8k | ((static_cast<U>(1) << need_bit) - 1)); | 547 | 72.8k | } | 548 | 72.8k | output++; | 549 | 72.8k | valid_bit -= need_bit; | 550 | 72.8k | } | 551 | | | 552 | 75.2k | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 553 | | | 554 | | // same as before | 555 | 101k | for (int j = 0; j < num; j++) { | 556 | 26.6k | *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask); | 557 | 26.6k | output++; | 558 | 26.6k | } | 559 | 75.2k | } | 560 | 97.0k | } |
Unexecuted instantiation: _ZN5doris10ForDecoderIhE19bit_unpack_optimizeIlEEvPKhhiPh Unexecuted instantiation: _ZN5doris10ForDecoderIhE19bit_unpack_optimizeInEEvPKhhiPh Unexecuted instantiation: _ZN5doris10ForDecoderItE19bit_unpack_optimizeIlEEvPKhhiPt Unexecuted instantiation: _ZN5doris10ForDecoderItE19bit_unpack_optimizeInEEvPKhhiPt _ZN5doris10ForDecoderIjE19bit_unpack_optimizeIlEEvPKhhiPj Line | Count | Source | 438 | 10 | T* output) { | 439 | 10 | static_assert(std::is_same<U, int64_t>::value || std::is_same<U, __int128_t>::value, | 440 | 10 | "bit_unpack_optimize only supports U = int64_t or __int128_t"); | 441 | 10 | constexpr int u_size = sizeof(U); // Size of U | 442 | 10 | constexpr int u_size_shift = (u_size == 8) ? 3 : 4; // log2(u_size) | 443 | 10 | int valid_bit = 0; // How many valid bits | 444 | 10 | int need_bit = 0; // still need | 445 | 10 | size_t input_size = (in_num * bit_width + 7) >> 3; // input's size | 446 | 10 | int full_batch_size = | 447 | 10 | cast_set<int>((input_size >> u_size_shift) | 448 | 10 | << u_size_shift); // Adjust input_size to a multiple of u_size | 449 | 10 | int tail_count = input_size & (u_size - 1); // The remainder of input_size modulo u_size. | 450 | | // The number of bits in input to adjust to multiples of 8 and thus more | 451 | 10 | int more_bit = cast_set<int>((input_size << 3) - (in_num * bit_width)); | 452 | | | 453 | | // to ensure that only bit_width bits are valid | 454 | 10 | T output_mask; | 455 | 10 | if (bit_width >= static_cast<int>(sizeof(T) * 8)) { | 456 | 0 | output_mask = static_cast<T>(~T(0)); | 457 | 10 | } else { | 458 | 10 | output_mask = static_cast<T>((static_cast<T>(1) << bit_width) - 1); | 459 | 10 | } | 460 | | | 461 | 10 | U s = 0; // Temporary buffer for bitstream: aggregates input bytes into a large integer for unpacking | 462 | | | 463 | 30 | for (int i = 0; i < full_batch_size; i += u_size) { | 464 | 20 | s = 0; | 465 | | | 466 | 20 | s = to_endian<std::endian::big>(*((U*)(input + i))); | 467 | | | 468 | | // Determine what the valid bits are based on u_size | 469 | 20 | valid_bit = u_size << 3; | 470 | | | 471 | | // If input_size is exactly a multiple of 8, then need to remove the last more_bit in the last loop. | 472 | 20 | if (tail_count == 0 && i == full_batch_size - u_size) { | 473 | 10 | valid_bit -= more_bit; | 474 | 10 | s >>= more_bit; | 475 | 10 | } | 476 | | | 477 | 20 | if (need_bit) { | 478 | | // The last time we take away the high bit_width - need_bit, | 479 | | // we need to make up the rest of the need_bit from the width. | 480 | | // Use valid_bit - need_bit to compute high need_bit bits of s | 481 | | // perform an AND operation to ensure that only need_bit bits are valid | 482 | 0 | auto mask = (static_cast<U>(1) << need_bit) - 1; | 483 | 0 | auto shifted = s >> (valid_bit - need_bit); | 484 | 0 | auto masked_result = shifted & mask; | 485 | 0 | if constexpr (sizeof(T) <= 4) { | 486 | 0 | *output |= static_cast<T>(static_cast<uint32_t>(masked_result)); | 487 | | } else { | 488 | | *output |= static_cast<T>(masked_result); | 489 | | } | 490 | 0 | output++; | 491 | 0 | valid_bit -= need_bit; | 492 | 0 | } | 493 | | | 494 | 20 | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 495 | 20 | int remainder = valid_bit - num * bit_width; // How many bits are left to store | 496 | | | 497 | | // Starting with the highest valid bit, take out bit_width bits in sequence | 498 | | // perform an AND operation with output_mask to ensure that only bit_width bits are valid | 499 | | // (num-j-1) * bit_width used to calculate how many bits need to be removed at the end | 500 | | // But since there are still remainder bits that can't be processed, need to add the remainder | 501 | 1.30k | for (int j = 0; j < num; j++) { | 502 | 1.28k | *output = | 503 | 1.28k | static_cast<T>((s >> (((num - j - 1) * bit_width) + remainder)) & output_mask); | 504 | 1.28k | output++; | 505 | 1.28k | } | 506 | | | 507 | 20 | if (remainder) { | 508 | | // Process the last remaining remainder bit. | 509 | | // y = (s & ((static_cast<U>(1) << remainder) - 1)) extract the last remainder bits. | 510 | | // output = y << (bit_width - remainder) Use the high bit_width - remainder bit | 511 | 0 | if constexpr (sizeof(T) <= 4) { | 512 | 0 | auto masked_value = static_cast<T>( | 513 | 0 | static_cast<uint32_t>(s & ((static_cast<U>(1) << remainder) - 1))); | 514 | 0 | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 515 | | } else { | 516 | | auto masked_value = static_cast<T>((s & ((static_cast<U>(1) << remainder) - 1))); | 517 | | *output = static_cast<T>(masked_value << (bit_width - remainder)); | 518 | | } | 519 | | // Already have remainder bits, next time need bit_width - remainder bits | 520 | 0 | need_bit = bit_width - remainder; | 521 | 20 | } else { | 522 | 20 | need_bit = 0; | 523 | 20 | } | 524 | 20 | } | 525 | | | 526 | | // remainder | 527 | 10 | if (tail_count) { | 528 | | // Put the tail_count numbers in the input into s in order, each number occupies 8 bit | 529 | 0 | for (int i = 0; i < tail_count; i++) { | 530 | 0 | s <<= 8; | 531 | 0 | s |= input[full_batch_size + i]; | 532 | 0 | } | 533 | | | 534 | | // tail * 8 is the number of bits that are left to process | 535 | | // tail * 8 - more_bit is to remove the last more_bit | 536 | 0 | valid_bit = (tail_count << 3) - more_bit; | 537 | 0 | s >>= more_bit; | 538 | | | 539 | | // same as before | 540 | 0 | if (need_bit) { | 541 | 0 | if constexpr (sizeof(T) <= 4) { | 542 | 0 | *output |= static_cast<T>(static_cast<uint32_t>( | 543 | 0 | (s >> (valid_bit - need_bit)) & ((static_cast<U>(1) << need_bit) - 1))); | 544 | | } else { | 545 | | *output |= static_cast<T>((s >> (valid_bit - need_bit)) & | 546 | | ((static_cast<U>(1) << need_bit) - 1)); | 547 | | } | 548 | 0 | output++; | 549 | 0 | valid_bit -= need_bit; | 550 | 0 | } | 551 | |
| 552 | 0 | int num = valid_bit / bit_width; // How many outputs can be processed at a time | 553 | | | 554 | | // same as before | 555 | 0 | for (int j = 0; j < num; j++) { | 556 | 0 | *output = static_cast<T>((s >> (((num - j - 1) * bit_width))) & output_mask); | 557 | 0 | output++; | 558 | 0 | } | 559 | 0 | } | 560 | 10 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderIjE19bit_unpack_optimizeInEEvPKhhiPj Unexecuted instantiation: _ZN5doris10ForDecoderImE19bit_unpack_optimizeIlEEvPKhhiPm Unexecuted instantiation: _ZN5doris10ForDecoderImE19bit_unpack_optimizeInEEvPKhhiPm Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE19bit_unpack_optimizeIlEEvPKhhiPS1_ Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE19bit_unpack_optimizeInEEvPKhhiPS1_ Unexecuted instantiation: _ZN5doris10ForDecoderIoE19bit_unpack_optimizeIlEEvPKhhiPo Unexecuted instantiation: _ZN5doris10ForDecoderIoE19bit_unpack_optimizeInEEvPKhhiPo |
561 | | |
562 | | // The reverse of bit_pack method, get original integer data list from packed bits |
563 | | // param[in] input: the packed bits need to unpack |
564 | | // param[in] in_num: the integer number in packed bits |
565 | | // param[in] bit_width: how many bit we used to store each integer data |
566 | | // param[out] output: the original integer data list |
567 | | template <typename T> |
568 | 162k | void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) { |
569 | | /* |
570 | | When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times. |
571 | | When bit_width > 64, we must use __int128_t and unroll the loop 16 times. |
572 | | */ |
573 | 162k | if (bit_width <= 32) { |
574 | 41.2k | bit_unpack_optimize<int64_t>(input, in_num, bit_width, output); |
575 | 121k | } else { |
576 | 121k | bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output); |
577 | 121k | } |
578 | 162k | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE10bit_unpackEPKhhiPa Unexecuted instantiation: _ZN5doris10ForDecoderIsE10bit_unpackEPKhhiPs _ZN5doris10ForDecoderIiE10bit_unpackEPKhhiPi Line | Count | Source | 568 | 18 | void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) { | 569 | | /* | 570 | | When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times. | 571 | | When bit_width > 64, we must use __int128_t and unroll the loop 16 times. | 572 | | */ | 573 | 18 | if (bit_width <= 32) { | 574 | 18 | bit_unpack_optimize<int64_t>(input, in_num, bit_width, output); | 575 | 18 | } else { | 576 | 0 | bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output); | 577 | 0 | } | 578 | 18 | } |
_ZN5doris10ForDecoderIlE10bit_unpackEPKhhiPl Line | Count | Source | 568 | 48.9k | void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) { | 569 | | /* | 570 | | When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times. | 571 | | When bit_width > 64, we must use __int128_t and unroll the loop 16 times. | 572 | | */ | 573 | 48.9k | if (bit_width <= 32) { | 574 | 24.6k | bit_unpack_optimize<int64_t>(input, in_num, bit_width, output); | 575 | 24.6k | } else { | 576 | 24.3k | bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output); | 577 | 24.3k | } | 578 | 48.9k | } |
_ZN5doris10ForDecoderInE10bit_unpackEPKhhiPn Line | Count | Source | 568 | 113k | void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) { | 569 | | /* | 570 | | When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times. | 571 | | When bit_width > 64, we must use __int128_t and unroll the loop 16 times. | 572 | | */ | 573 | 113k | if (bit_width <= 32) { | 574 | 16.5k | bit_unpack_optimize<int64_t>(input, in_num, bit_width, output); | 575 | 97.0k | } else { | 576 | 97.0k | bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output); | 577 | 97.0k | } | 578 | 113k | } |
Unexecuted instantiation: _ZN5doris10ForDecoderIhE10bit_unpackEPKhhiPh Unexecuted instantiation: _ZN5doris10ForDecoderItE10bit_unpackEPKhhiPt _ZN5doris10ForDecoderIjE10bit_unpackEPKhhiPj Line | Count | Source | 568 | 10 | void ForDecoder<T>::bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output) { | 569 | | /* | 570 | | When 32 < bit_width <= 64 unrolling the loop 16 times is more efficient than unrolling it 8 times. | 571 | | When bit_width > 64, we must use __int128_t and unroll the loop 16 times. | 572 | | */ | 573 | 10 | if (bit_width <= 32) { | 574 | 10 | bit_unpack_optimize<int64_t>(input, in_num, bit_width, output); | 575 | 10 | } else { | 576 | 0 | bit_unpack_optimize<__int128_t>(input, in_num, bit_width, output); | 577 | 0 | } | 578 | 10 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderImE10bit_unpackEPKhhiPm Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE10bit_unpackEPKhhiPS1_ Unexecuted instantiation: _ZN5doris10ForDecoderIoE10bit_unpackEPKhhiPo |
579 | | |
580 | | template <typename T> |
581 | 8.35M | void ForDecoder<T>::decode_current_frame(T* output) { |
582 | 8.35M | uint32_t frame_index = _current_index / _max_frame_size; |
583 | 8.35M | if (frame_index == _current_decoded_frame) { |
584 | 8.25M | return; // current frame already decoded |
585 | 8.25M | } |
586 | 97.8k | _current_decoded_frame = frame_index; |
587 | 97.8k | uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index)); |
588 | | |
589 | 97.8k | uint32_t base_offset = _frame_offsets[_current_decoded_frame]; |
590 | 97.8k | T min = 0; |
591 | 97.8k | uint32_t delta_offset = 0; |
592 | 97.8k | if constexpr (sizeof(T) == 16) { |
593 | 48.8k | min = static_cast<T>(decode_fixed128_le(_buffer + base_offset)); |
594 | 48.8k | delta_offset = base_offset + 16; |
595 | 48.9k | } else if constexpr (sizeof(T) == 8) { |
596 | 48.9k | min = static_cast<T>(decode_fixed64_le(_buffer + base_offset)); |
597 | 48.9k | delta_offset = base_offset + 8; |
598 | 48.9k | } else { |
599 | 28 | min = static_cast<T>(decode_fixed32_le(_buffer + base_offset)); |
600 | 28 | delta_offset = base_offset + 4; |
601 | 28 | } |
602 | | |
603 | 97.8k | uint8_t bit_width = _bit_widths[_current_decoded_frame]; |
604 | | |
605 | 97.8k | bool is_original_value = _storage_formats[_current_decoded_frame] == 2; |
606 | 97.8k | if (is_original_value) { |
607 | 0 | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output); |
608 | 97.8k | } else { |
609 | 97.8k | bool is_ascending = _storage_formats[_current_decoded_frame] == 1; |
610 | 97.8k | std::vector<T> delta_values(current_frame_size); |
611 | 97.8k | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data()); |
612 | 97.8k | if (is_ascending) { |
613 | 902 | T pre_value = min; |
614 | 7.74k | for (uint8_t i = 0; i < current_frame_size; i++) { |
615 | 6.84k | T value = delta_values[i] + pre_value; |
616 | 6.84k | output[i] = value; |
617 | 6.84k | pre_value = value; |
618 | 6.84k | } |
619 | 96.9k | } else { |
620 | 8.45M | for (uint8_t i = 0; i < current_frame_size; i++) { |
621 | 8.35M | output[i] = delta_values[i] + min; |
622 | 8.35M | } |
623 | 96.9k | } |
624 | 97.8k | } |
625 | 97.8k | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE20decode_current_frameEPa Unexecuted instantiation: _ZN5doris10ForDecoderIsE20decode_current_frameEPs _ZN5doris10ForDecoderIiE20decode_current_frameEPi Line | Count | Source | 581 | 20 | void ForDecoder<T>::decode_current_frame(T* output) { | 582 | 20 | uint32_t frame_index = _current_index / _max_frame_size; | 583 | 20 | if (frame_index == _current_decoded_frame) { | 584 | 2 | return; // current frame already decoded | 585 | 2 | } | 586 | 18 | _current_decoded_frame = frame_index; | 587 | 18 | uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index)); | 588 | | | 589 | 18 | uint32_t base_offset = _frame_offsets[_current_decoded_frame]; | 590 | 18 | T min = 0; | 591 | 18 | uint32_t delta_offset = 0; | 592 | | if constexpr (sizeof(T) == 16) { | 593 | | min = static_cast<T>(decode_fixed128_le(_buffer + base_offset)); | 594 | | delta_offset = base_offset + 16; | 595 | | } else if constexpr (sizeof(T) == 8) { | 596 | | min = static_cast<T>(decode_fixed64_le(_buffer + base_offset)); | 597 | | delta_offset = base_offset + 8; | 598 | 18 | } else { | 599 | 18 | min = static_cast<T>(decode_fixed32_le(_buffer + base_offset)); | 600 | 18 | delta_offset = base_offset + 4; | 601 | 18 | } | 602 | | | 603 | 18 | uint8_t bit_width = _bit_widths[_current_decoded_frame]; | 604 | | | 605 | 18 | bool is_original_value = _storage_formats[_current_decoded_frame] == 2; | 606 | 18 | if (is_original_value) { | 607 | 0 | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output); | 608 | 18 | } else { | 609 | 18 | bool is_ascending = _storage_formats[_current_decoded_frame] == 1; | 610 | 18 | std::vector<T> delta_values(current_frame_size); | 611 | 18 | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data()); | 612 | 18 | if (is_ascending) { | 613 | 18 | T pre_value = min; | 614 | 1.56k | for (uint8_t i = 0; i < current_frame_size; i++) { | 615 | 1.54k | T value = delta_values[i] + pre_value; | 616 | 1.54k | output[i] = value; | 617 | 1.54k | pre_value = value; | 618 | 1.54k | } | 619 | 18 | } else { | 620 | 0 | for (uint8_t i = 0; i < current_frame_size; i++) { | 621 | 0 | output[i] = delta_values[i] + min; | 622 | 0 | } | 623 | 0 | } | 624 | 18 | } | 625 | 18 | } |
_ZN5doris10ForDecoderIlE20decode_current_frameEPl Line | Count | Source | 581 | 4.17M | void ForDecoder<T>::decode_current_frame(T* output) { | 582 | 4.17M | uint32_t frame_index = _current_index / _max_frame_size; | 583 | 4.17M | if (frame_index == _current_decoded_frame) { | 584 | 4.12M | return; // current frame already decoded | 585 | 4.12M | } | 586 | 48.9k | _current_decoded_frame = frame_index; | 587 | 48.9k | uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index)); | 588 | | | 589 | 48.9k | uint32_t base_offset = _frame_offsets[_current_decoded_frame]; | 590 | 48.9k | T min = 0; | 591 | 48.9k | uint32_t delta_offset = 0; | 592 | | if constexpr (sizeof(T) == 16) { | 593 | | min = static_cast<T>(decode_fixed128_le(_buffer + base_offset)); | 594 | | delta_offset = base_offset + 16; | 595 | 48.9k | } else if constexpr (sizeof(T) == 8) { | 596 | 48.9k | min = static_cast<T>(decode_fixed64_le(_buffer + base_offset)); | 597 | 48.9k | delta_offset = base_offset + 8; | 598 | | } else { | 599 | | min = static_cast<T>(decode_fixed32_le(_buffer + base_offset)); | 600 | | delta_offset = base_offset + 4; | 601 | | } | 602 | | | 603 | 48.9k | uint8_t bit_width = _bit_widths[_current_decoded_frame]; | 604 | | | 605 | 48.9k | bool is_original_value = _storage_formats[_current_decoded_frame] == 2; | 606 | 48.9k | if (is_original_value) { | 607 | 0 | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output); | 608 | 48.9k | } else { | 609 | 48.9k | bool is_ascending = _storage_formats[_current_decoded_frame] == 1; | 610 | 48.9k | std::vector<T> delta_values(current_frame_size); | 611 | 48.9k | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data()); | 612 | 48.9k | if (is_ascending) { | 613 | 446 | T pre_value = min; | 614 | 3.79k | for (uint8_t i = 0; i < current_frame_size; i++) { | 615 | 3.34k | T value = delta_values[i] + pre_value; | 616 | 3.34k | output[i] = value; | 617 | 3.34k | pre_value = value; | 618 | 3.34k | } | 619 | 48.4k | } else { | 620 | 4.22M | for (uint8_t i = 0; i < current_frame_size; i++) { | 621 | 4.17M | output[i] = delta_values[i] + min; | 622 | 4.17M | } | 623 | 48.4k | } | 624 | 48.9k | } | 625 | 48.9k | } |
_ZN5doris10ForDecoderInE20decode_current_frameEPn Line | Count | Source | 581 | 4.17M | void ForDecoder<T>::decode_current_frame(T* output) { | 582 | 4.17M | uint32_t frame_index = _current_index / _max_frame_size; | 583 | 4.17M | if (frame_index == _current_decoded_frame) { | 584 | 4.12M | return; // current frame already decoded | 585 | 4.12M | } | 586 | 48.8k | _current_decoded_frame = frame_index; | 587 | 48.8k | uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index)); | 588 | | | 589 | 48.8k | uint32_t base_offset = _frame_offsets[_current_decoded_frame]; | 590 | 48.8k | T min = 0; | 591 | 48.8k | uint32_t delta_offset = 0; | 592 | 48.8k | if constexpr (sizeof(T) == 16) { | 593 | 48.8k | min = static_cast<T>(decode_fixed128_le(_buffer + base_offset)); | 594 | 48.8k | delta_offset = base_offset + 16; | 595 | | } else if constexpr (sizeof(T) == 8) { | 596 | | min = static_cast<T>(decode_fixed64_le(_buffer + base_offset)); | 597 | | delta_offset = base_offset + 8; | 598 | | } else { | 599 | | min = static_cast<T>(decode_fixed32_le(_buffer + base_offset)); | 600 | | delta_offset = base_offset + 4; | 601 | | } | 602 | | | 603 | 48.8k | uint8_t bit_width = _bit_widths[_current_decoded_frame]; | 604 | | | 605 | 48.8k | bool is_original_value = _storage_formats[_current_decoded_frame] == 2; | 606 | 48.8k | if (is_original_value) { | 607 | 0 | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output); | 608 | 48.8k | } else { | 609 | 48.8k | bool is_ascending = _storage_formats[_current_decoded_frame] == 1; | 610 | 48.8k | std::vector<T> delta_values(current_frame_size); | 611 | 48.8k | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data()); | 612 | 48.8k | if (is_ascending) { | 613 | 428 | T pre_value = min; | 614 | 1.10k | for (uint8_t i = 0; i < current_frame_size; i++) { | 615 | 676 | T value = delta_values[i] + pre_value; | 616 | 676 | output[i] = value; | 617 | 676 | pre_value = value; | 618 | 676 | } | 619 | 48.4k | } else { | 620 | 4.22M | for (uint8_t i = 0; i < current_frame_size; i++) { | 621 | 4.17M | output[i] = delta_values[i] + min; | 622 | 4.17M | } | 623 | 48.4k | } | 624 | 48.8k | } | 625 | 48.8k | } |
Unexecuted instantiation: _ZN5doris10ForDecoderIhE20decode_current_frameEPh Unexecuted instantiation: _ZN5doris10ForDecoderItE20decode_current_frameEPt _ZN5doris10ForDecoderIjE20decode_current_frameEPj Line | Count | Source | 581 | 10 | void ForDecoder<T>::decode_current_frame(T* output) { | 582 | 10 | uint32_t frame_index = _current_index / _max_frame_size; | 583 | 10 | if (frame_index == _current_decoded_frame) { | 584 | 0 | return; // current frame already decoded | 585 | 0 | } | 586 | 10 | _current_decoded_frame = frame_index; | 587 | 10 | uint8_t current_frame_size = cast_set<uint8_t>(frame_size(frame_index)); | 588 | | | 589 | 10 | uint32_t base_offset = _frame_offsets[_current_decoded_frame]; | 590 | 10 | T min = 0; | 591 | 10 | uint32_t delta_offset = 0; | 592 | | if constexpr (sizeof(T) == 16) { | 593 | | min = static_cast<T>(decode_fixed128_le(_buffer + base_offset)); | 594 | | delta_offset = base_offset + 16; | 595 | | } else if constexpr (sizeof(T) == 8) { | 596 | | min = static_cast<T>(decode_fixed64_le(_buffer + base_offset)); | 597 | | delta_offset = base_offset + 8; | 598 | 10 | } else { | 599 | 10 | min = static_cast<T>(decode_fixed32_le(_buffer + base_offset)); | 600 | 10 | delta_offset = base_offset + 4; | 601 | 10 | } | 602 | | | 603 | 10 | uint8_t bit_width = _bit_widths[_current_decoded_frame]; | 604 | | | 605 | 10 | bool is_original_value = _storage_formats[_current_decoded_frame] == 2; | 606 | 10 | if (is_original_value) { | 607 | 0 | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, output); | 608 | 10 | } else { | 609 | 10 | bool is_ascending = _storage_formats[_current_decoded_frame] == 1; | 610 | 10 | std::vector<T> delta_values(current_frame_size); | 611 | 10 | bit_unpack(_buffer + delta_offset, current_frame_size, bit_width, delta_values.data()); | 612 | 10 | if (is_ascending) { | 613 | 10 | T pre_value = min; | 614 | 1.29k | for (uint8_t i = 0; i < current_frame_size; i++) { | 615 | 1.28k | T value = delta_values[i] + pre_value; | 616 | 1.28k | output[i] = value; | 617 | 1.28k | pre_value = value; | 618 | 1.28k | } | 619 | 10 | } else { | 620 | 0 | for (uint8_t i = 0; i < current_frame_size; i++) { | 621 | 0 | output[i] = delta_values[i] + min; | 622 | 0 | } | 623 | 0 | } | 624 | 10 | } | 625 | 10 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderImE20decode_current_frameEPm Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE20decode_current_frameEPS1_ Unexecuted instantiation: _ZN5doris10ForDecoderIoE20decode_current_frameEPo |
626 | | |
627 | | template <typename T> |
628 | 24 | T ForDecoder<T>::decode_frame_min_value(uint32_t frame_index) { |
629 | 24 | uint32_t min_offset = _frame_offsets[frame_index]; |
630 | 24 | T min = 0; |
631 | 24 | if constexpr (sizeof(T) == 16) { |
632 | 0 | min = static_cast<T>(decode_fixed128_le(_buffer + min_offset)); |
633 | 24 | } else if constexpr (sizeof(T) == 8) { |
634 | 24 | min = static_cast<T>(decode_fixed64_le(_buffer + min_offset)); |
635 | 24 | } else { |
636 | 0 | min = static_cast<T>(decode_fixed32_le(_buffer + min_offset)); |
637 | 0 | } |
638 | 24 | return min; |
639 | 24 | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE22decode_frame_min_valueEj Unexecuted instantiation: _ZN5doris10ForDecoderIsE22decode_frame_min_valueEj Unexecuted instantiation: _ZN5doris10ForDecoderIiE22decode_frame_min_valueEj _ZN5doris10ForDecoderIlE22decode_frame_min_valueEj Line | Count | Source | 628 | 24 | T ForDecoder<T>::decode_frame_min_value(uint32_t frame_index) { | 629 | 24 | uint32_t min_offset = _frame_offsets[frame_index]; | 630 | 24 | T min = 0; | 631 | | if constexpr (sizeof(T) == 16) { | 632 | | min = static_cast<T>(decode_fixed128_le(_buffer + min_offset)); | 633 | 24 | } else if constexpr (sizeof(T) == 8) { | 634 | 24 | min = static_cast<T>(decode_fixed64_le(_buffer + min_offset)); | 635 | | } else { | 636 | | min = static_cast<T>(decode_fixed32_le(_buffer + min_offset)); | 637 | | } | 638 | 24 | return min; | 639 | 24 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderInE22decode_frame_min_valueEj Unexecuted instantiation: _ZN5doris10ForDecoderIhE22decode_frame_min_valueEj Unexecuted instantiation: _ZN5doris10ForDecoderItE22decode_frame_min_valueEj Unexecuted instantiation: _ZN5doris10ForDecoderIjE22decode_frame_min_valueEj Unexecuted instantiation: _ZN5doris10ForDecoderImE22decode_frame_min_valueEj Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE22decode_frame_min_valueEj Unexecuted instantiation: _ZN5doris10ForDecoderIoE22decode_frame_min_valueEj |
640 | | |
641 | | template <typename T> |
642 | 8.35M | T* ForDecoder<T>::copy_value(T* val, size_t count) { |
643 | 8.35M | memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count); |
644 | 8.35M | _current_index += count; |
645 | 8.35M | val += count; |
646 | 8.35M | return val; |
647 | 8.35M | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE10copy_valueEPam Unexecuted instantiation: _ZN5doris10ForDecoderIsE10copy_valueEPsm _ZN5doris10ForDecoderIiE10copy_valueEPim Line | Count | Source | 642 | 16 | T* ForDecoder<T>::copy_value(T* val, size_t count) { | 643 | 16 | memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count); | 644 | 16 | _current_index += count; | 645 | 16 | val += count; | 646 | 16 | return val; | 647 | 16 | } |
_ZN5doris10ForDecoderIlE10copy_valueEPlm Line | Count | Source | 642 | 4.17M | T* ForDecoder<T>::copy_value(T* val, size_t count) { | 643 | 4.17M | memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count); | 644 | 4.17M | _current_index += count; | 645 | 4.17M | val += count; | 646 | 4.17M | return val; | 647 | 4.17M | } |
_ZN5doris10ForDecoderInE10copy_valueEPnm Line | Count | Source | 642 | 4.17M | T* ForDecoder<T>::copy_value(T* val, size_t count) { | 643 | 4.17M | memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count); | 644 | 4.17M | _current_index += count; | 645 | 4.17M | val += count; | 646 | 4.17M | return val; | 647 | 4.17M | } |
Unexecuted instantiation: _ZN5doris10ForDecoderIhE10copy_valueEPhm Unexecuted instantiation: _ZN5doris10ForDecoderItE10copy_valueEPtm _ZN5doris10ForDecoderIjE10copy_valueEPjm Line | Count | Source | 642 | 6 | T* ForDecoder<T>::copy_value(T* val, size_t count) { | 643 | 6 | memcpy(val, &_out_buffer[_current_index % _max_frame_size], sizeof(T) * count); | 644 | 6 | _current_index += count; | 645 | 6 | val += count; | 646 | 6 | return val; | 647 | 6 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderImE10copy_valueEPmm Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE10copy_valueEPS1_m Unexecuted instantiation: _ZN5doris10ForDecoderIoE10copy_valueEPom |
648 | | |
649 | | template <typename T> |
650 | 8.35M | bool ForDecoder<T>::get_batch(T* val, size_t count) { |
651 | 8.35M | if (_current_index + count > _values_num) { |
652 | 2 | return false; |
653 | 2 | } |
654 | | |
655 | 8.35M | decode_current_frame(_out_buffer.data()); |
656 | | |
657 | 8.35M | if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) { |
658 | 8.32M | copy_value(val, count); |
659 | 8.32M | return true; |
660 | 8.32M | } |
661 | | |
662 | | // 1. padding one frame |
663 | 32.7k | size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index; |
664 | 32.7k | val = copy_value(val, padding_num); |
665 | | |
666 | | // 2. process frame by frame |
667 | 32.7k | size_t frame_count = (count - padding_num) / _max_frame_size; |
668 | 32.8k | for (size_t i = 0; i < frame_count; i++) { |
669 | | // directly decode value to the output, don't buffer the value |
670 | 14 | decode_current_frame(val); |
671 | 14 | _current_index += _max_frame_size; |
672 | 14 | val += _max_frame_size; |
673 | 14 | } |
674 | | |
675 | | // 3. process remaining value |
676 | 32.7k | size_t remaining_num = (count - padding_num) % _max_frame_size; |
677 | 32.7k | if (remaining_num > 0) { |
678 | 8 | decode_current_frame(_out_buffer.data()); |
679 | 8 | val = copy_value(val, remaining_num); |
680 | 8 | } |
681 | | |
682 | 32.7k | return true; |
683 | 8.35M | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE9get_batchEPam Unexecuted instantiation: _ZN5doris10ForDecoderIsE9get_batchEPsm _ZN5doris10ForDecoderIiE9get_batchEPim Line | Count | Source | 650 | 16 | bool ForDecoder<T>::get_batch(T* val, size_t count) { | 651 | 16 | if (_current_index + count > _values_num) { | 652 | 2 | return false; | 653 | 2 | } | 654 | | | 655 | 14 | decode_current_frame(_out_buffer.data()); | 656 | | | 657 | 14 | if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) { | 658 | 8 | copy_value(val, count); | 659 | 8 | return true; | 660 | 8 | } | 661 | | | 662 | | // 1. padding one frame | 663 | 6 | size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index; | 664 | 6 | val = copy_value(val, padding_num); | 665 | | | 666 | | // 2. process frame by frame | 667 | 6 | size_t frame_count = (count - padding_num) / _max_frame_size; | 668 | 10 | for (size_t i = 0; i < frame_count; i++) { | 669 | | // directly decode value to the output, don't buffer the value | 670 | 4 | decode_current_frame(val); | 671 | 4 | _current_index += _max_frame_size; | 672 | 4 | val += _max_frame_size; | 673 | 4 | } | 674 | | | 675 | | // 3. process remaining value | 676 | 6 | size_t remaining_num = (count - padding_num) % _max_frame_size; | 677 | 6 | if (remaining_num > 0) { | 678 | 2 | decode_current_frame(_out_buffer.data()); | 679 | 2 | val = copy_value(val, remaining_num); | 680 | 2 | } | 681 | | | 682 | 6 | return true; | 683 | 14 | } |
_ZN5doris10ForDecoderIlE9get_batchEPlm Line | Count | Source | 650 | 4.17M | bool ForDecoder<T>::get_batch(T* val, size_t count) { | 651 | 4.17M | if (_current_index + count > _values_num) { | 652 | 0 | return false; | 653 | 0 | } | 654 | | | 655 | 4.17M | decode_current_frame(_out_buffer.data()); | 656 | | | 657 | 4.17M | if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) { | 658 | 4.16M | copy_value(val, count); | 659 | 4.16M | return true; | 660 | 4.16M | } | 661 | | | 662 | | // 1. padding one frame | 663 | 16.3k | size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index; | 664 | 16.3k | val = copy_value(val, padding_num); | 665 | | | 666 | | // 2. process frame by frame | 667 | 16.3k | size_t frame_count = (count - padding_num) / _max_frame_size; | 668 | 16.3k | for (size_t i = 0; i < frame_count; i++) { | 669 | | // directly decode value to the output, don't buffer the value | 670 | 6 | decode_current_frame(val); | 671 | 6 | _current_index += _max_frame_size; | 672 | 6 | val += _max_frame_size; | 673 | 6 | } | 674 | | | 675 | | // 3. process remaining value | 676 | 16.3k | size_t remaining_num = (count - padding_num) % _max_frame_size; | 677 | 16.3k | if (remaining_num > 0) { | 678 | 6 | decode_current_frame(_out_buffer.data()); | 679 | 6 | val = copy_value(val, remaining_num); | 680 | 6 | } | 681 | | | 682 | 16.3k | return true; | 683 | 4.17M | } |
_ZN5doris10ForDecoderInE9get_batchEPnm Line | Count | Source | 650 | 4.17M | bool ForDecoder<T>::get_batch(T* val, size_t count) { | 651 | 4.17M | if (_current_index + count > _values_num) { | 652 | 0 | return false; | 653 | 0 | } | 654 | | | 655 | 4.17M | decode_current_frame(_out_buffer.data()); | 656 | | | 657 | 4.17M | if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) { | 658 | 4.16M | copy_value(val, count); | 659 | 4.16M | return true; | 660 | 4.16M | } | 661 | | | 662 | | // 1. padding one frame | 663 | 16.3k | size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index; | 664 | 16.3k | val = copy_value(val, padding_num); | 665 | | | 666 | | // 2. process frame by frame | 667 | 16.3k | size_t frame_count = (count - padding_num) / _max_frame_size; | 668 | 16.3k | for (size_t i = 0; i < frame_count; i++) { | 669 | | // directly decode value to the output, don't buffer the value | 670 | 0 | decode_current_frame(val); | 671 | 0 | _current_index += _max_frame_size; | 672 | 0 | val += _max_frame_size; | 673 | 0 | } | 674 | | | 675 | | // 3. process remaining value | 676 | 16.3k | size_t remaining_num = (count - padding_num) % _max_frame_size; | 677 | 16.3k | if (remaining_num > 0) { | 678 | 0 | decode_current_frame(_out_buffer.data()); | 679 | 0 | val = copy_value(val, remaining_num); | 680 | 0 | } | 681 | | | 682 | 16.3k | return true; | 683 | 4.17M | } |
Unexecuted instantiation: _ZN5doris10ForDecoderIhE9get_batchEPhm Unexecuted instantiation: _ZN5doris10ForDecoderItE9get_batchEPtm _ZN5doris10ForDecoderIjE9get_batchEPjm Line | Count | Source | 650 | 6 | bool ForDecoder<T>::get_batch(T* val, size_t count) { | 651 | 6 | if (_current_index + count > _values_num) { | 652 | 0 | return false; | 653 | 0 | } | 654 | | | 655 | 6 | decode_current_frame(_out_buffer.data()); | 656 | | | 657 | 6 | if (_current_index + count < _max_frame_size * (_current_decoded_frame + 1)) { | 658 | 0 | copy_value(val, count); | 659 | 0 | return true; | 660 | 0 | } | 661 | | | 662 | | // 1. padding one frame | 663 | 6 | size_t padding_num = _max_frame_size * (_current_decoded_frame + 1) - _current_index; | 664 | 6 | val = copy_value(val, padding_num); | 665 | | | 666 | | // 2. process frame by frame | 667 | 6 | size_t frame_count = (count - padding_num) / _max_frame_size; | 668 | 10 | for (size_t i = 0; i < frame_count; i++) { | 669 | | // directly decode value to the output, don't buffer the value | 670 | 4 | decode_current_frame(val); | 671 | 4 | _current_index += _max_frame_size; | 672 | 4 | val += _max_frame_size; | 673 | 4 | } | 674 | | | 675 | | // 3. process remaining value | 676 | 6 | size_t remaining_num = (count - padding_num) % _max_frame_size; | 677 | 6 | if (remaining_num > 0) { | 678 | 0 | decode_current_frame(_out_buffer.data()); | 679 | 0 | val = copy_value(val, remaining_num); | 680 | 0 | } | 681 | | | 682 | 6 | return true; | 683 | 6 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderImE9get_batchEPmm Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE9get_batchEPS1_m Unexecuted instantiation: _ZN5doris10ForDecoderIoE9get_batchEPom |
684 | | |
685 | | template <typename T> |
686 | 6 | bool ForDecoder<T>::skip(int32_t skip_num) { |
687 | 6 | if (_current_index + skip_num >= _values_num) { |
688 | 0 | return false; |
689 | 0 | } |
690 | 6 | _current_index = _current_index + skip_num; |
691 | 6 | return true; |
692 | 6 | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE4skipEi Unexecuted instantiation: _ZN5doris10ForDecoderIsE4skipEi Unexecuted instantiation: _ZN5doris10ForDecoderIiE4skipEi Unexecuted instantiation: _ZN5doris10ForDecoderIlE4skipEi Unexecuted instantiation: _ZN5doris10ForDecoderInE4skipEi Unexecuted instantiation: _ZN5doris10ForDecoderIhE4skipEi Unexecuted instantiation: _ZN5doris10ForDecoderItE4skipEi _ZN5doris10ForDecoderIjE4skipEi Line | Count | Source | 686 | 6 | bool ForDecoder<T>::skip(int32_t skip_num) { | 687 | 6 | if (_current_index + skip_num >= _values_num) { | 688 | 0 | return false; | 689 | 0 | } | 690 | 6 | _current_index = _current_index + skip_num; | 691 | 6 | return true; | 692 | 6 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderImE4skipEi Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE4skipEi Unexecuted instantiation: _ZN5doris10ForDecoderIoE4skipEi |
693 | | |
694 | | template <typename T> |
695 | 12 | uint32_t ForDecoder<T>::seek_last_frame_before_value(T target) { |
696 | | // first of all, find the first frame >= target |
697 | 12 | uint32_t left = 0; |
698 | 12 | uint32_t right = _frame_count; |
699 | 36 | while (left < right) { |
700 | 24 | uint32_t mid = left + (right - left) / 2; |
701 | 24 | T midValue = decode_frame_min_value(mid); |
702 | 24 | if (midValue < target) { |
703 | 12 | left = mid + 1; |
704 | 12 | } else { |
705 | 12 | right = mid; |
706 | 12 | } |
707 | 24 | } |
708 | | // after loop, left is the first frame >= target |
709 | 12 | if (left == 0) { |
710 | | // all frames are >= target, not found |
711 | 4 | return _frame_count; |
712 | 4 | } |
713 | | // otherwise previous frame is the last frame < target |
714 | 8 | return left - 1; |
715 | 12 | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE28seek_last_frame_before_valueEa Unexecuted instantiation: _ZN5doris10ForDecoderIsE28seek_last_frame_before_valueEs Unexecuted instantiation: _ZN5doris10ForDecoderIiE28seek_last_frame_before_valueEi _ZN5doris10ForDecoderIlE28seek_last_frame_before_valueEl Line | Count | Source | 695 | 12 | uint32_t ForDecoder<T>::seek_last_frame_before_value(T target) { | 696 | | // first of all, find the first frame >= target | 697 | 12 | uint32_t left = 0; | 698 | 12 | uint32_t right = _frame_count; | 699 | 36 | while (left < right) { | 700 | 24 | uint32_t mid = left + (right - left) / 2; | 701 | 24 | T midValue = decode_frame_min_value(mid); | 702 | 24 | if (midValue < target) { | 703 | 12 | left = mid + 1; | 704 | 12 | } else { | 705 | 12 | right = mid; | 706 | 12 | } | 707 | 24 | } | 708 | | // after loop, left is the first frame >= target | 709 | 12 | if (left == 0) { | 710 | | // all frames are >= target, not found | 711 | 4 | return _frame_count; | 712 | 4 | } | 713 | | // otherwise previous frame is the last frame < target | 714 | 8 | return left - 1; | 715 | 12 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderInE28seek_last_frame_before_valueEn Unexecuted instantiation: _ZN5doris10ForDecoderIhE28seek_last_frame_before_valueEh Unexecuted instantiation: _ZN5doris10ForDecoderItE28seek_last_frame_before_valueEt Unexecuted instantiation: _ZN5doris10ForDecoderIjE28seek_last_frame_before_valueEj Unexecuted instantiation: _ZN5doris10ForDecoderImE28seek_last_frame_before_valueEm Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE28seek_last_frame_before_valueES1_ Unexecuted instantiation: _ZN5doris10ForDecoderIoE28seek_last_frame_before_valueEo |
716 | | |
717 | | template <typename T> |
718 | | bool ForDecoder<T>::seek_lower_bound_inside_frame(uint32_t frame_index, T target, |
719 | 8 | bool* exact_match) { |
720 | 8 | _current_index = frame_index * _max_frame_size; |
721 | 8 | decode_current_frame(_out_buffer.data()); |
722 | 8 | auto end = _out_buffer.begin() + frame_size(frame_index); |
723 | 8 | auto pos = std::lower_bound(_out_buffer.begin(), end, target); |
724 | 8 | if (pos != end) { // found in this frame |
725 | 4 | auto pos_in_frame = cast_set<uint32_t>(std::distance(_out_buffer.begin(), pos)); |
726 | 4 | *exact_match = _out_buffer[pos_in_frame] == target; |
727 | 4 | _current_index += pos_in_frame; |
728 | 4 | return true; |
729 | 4 | } |
730 | 4 | return false; |
731 | 8 | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE29seek_lower_bound_inside_frameEjaPb Unexecuted instantiation: _ZN5doris10ForDecoderIsE29seek_lower_bound_inside_frameEjsPb Unexecuted instantiation: _ZN5doris10ForDecoderIiE29seek_lower_bound_inside_frameEjiPb _ZN5doris10ForDecoderIlE29seek_lower_bound_inside_frameEjlPb Line | Count | Source | 719 | 8 | bool* exact_match) { | 720 | 8 | _current_index = frame_index * _max_frame_size; | 721 | 8 | decode_current_frame(_out_buffer.data()); | 722 | 8 | auto end = _out_buffer.begin() + frame_size(frame_index); | 723 | 8 | auto pos = std::lower_bound(_out_buffer.begin(), end, target); | 724 | 8 | if (pos != end) { // found in this frame | 725 | 4 | auto pos_in_frame = cast_set<uint32_t>(std::distance(_out_buffer.begin(), pos)); | 726 | 4 | *exact_match = _out_buffer[pos_in_frame] == target; | 727 | 4 | _current_index += pos_in_frame; | 728 | 4 | return true; | 729 | 4 | } | 730 | 4 | return false; | 731 | 8 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderInE29seek_lower_bound_inside_frameEjnPb Unexecuted instantiation: _ZN5doris10ForDecoderIhE29seek_lower_bound_inside_frameEjhPb Unexecuted instantiation: _ZN5doris10ForDecoderItE29seek_lower_bound_inside_frameEjtPb Unexecuted instantiation: _ZN5doris10ForDecoderIjE29seek_lower_bound_inside_frameEjjPb Unexecuted instantiation: _ZN5doris10ForDecoderImE29seek_lower_bound_inside_frameEjmPb Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE29seek_lower_bound_inside_frameEjS1_Pb Unexecuted instantiation: _ZN5doris10ForDecoderIoE29seek_lower_bound_inside_frameEjoPb |
732 | | |
733 | | template <typename T> |
734 | 12 | bool ForDecoder<T>::seek_at_or_after_value(const void* value, bool* exact_match) { |
735 | 12 | T target = *reinterpret_cast<const T*>(value); |
736 | 12 | uint32_t frame_to_search = seek_last_frame_before_value(target); |
737 | 12 | if (frame_to_search == _frame_count) { |
738 | | // all frames are >= target, the searched value must the be first value |
739 | 4 | _current_index = 0; |
740 | 4 | decode_current_frame(_out_buffer.data()); |
741 | 4 | *exact_match = _out_buffer[0] == target; |
742 | 4 | return true; |
743 | 4 | } |
744 | | // binary search inside the last frame < target |
745 | 8 | bool found = seek_lower_bound_inside_frame(frame_to_search, target, exact_match); |
746 | | // if not found, all values in the last frame are less than target. |
747 | | // then the searched value must be the first value of the next frame. |
748 | 8 | if (!found && frame_to_search < _frame_count - 1) { |
749 | 2 | _current_index = (frame_to_search + 1) * _max_frame_size; |
750 | 2 | decode_current_frame(_out_buffer.data()); |
751 | 2 | *exact_match = _out_buffer[0] == target; |
752 | 2 | return true; |
753 | 2 | } |
754 | 6 | return found; |
755 | 8 | } Unexecuted instantiation: _ZN5doris10ForDecoderIaE22seek_at_or_after_valueEPKvPb Unexecuted instantiation: _ZN5doris10ForDecoderIsE22seek_at_or_after_valueEPKvPb Unexecuted instantiation: _ZN5doris10ForDecoderIiE22seek_at_or_after_valueEPKvPb _ZN5doris10ForDecoderIlE22seek_at_or_after_valueEPKvPb Line | Count | Source | 734 | 12 | bool ForDecoder<T>::seek_at_or_after_value(const void* value, bool* exact_match) { | 735 | 12 | T target = *reinterpret_cast<const T*>(value); | 736 | 12 | uint32_t frame_to_search = seek_last_frame_before_value(target); | 737 | 12 | if (frame_to_search == _frame_count) { | 738 | | // all frames are >= target, the searched value must the be first value | 739 | 4 | _current_index = 0; | 740 | 4 | decode_current_frame(_out_buffer.data()); | 741 | 4 | *exact_match = _out_buffer[0] == target; | 742 | 4 | return true; | 743 | 4 | } | 744 | | // binary search inside the last frame < target | 745 | 8 | bool found = seek_lower_bound_inside_frame(frame_to_search, target, exact_match); | 746 | | // if not found, all values in the last frame are less than target. | 747 | | // then the searched value must be the first value of the next frame. | 748 | 8 | if (!found && frame_to_search < _frame_count - 1) { | 749 | 2 | _current_index = (frame_to_search + 1) * _max_frame_size; | 750 | 2 | decode_current_frame(_out_buffer.data()); | 751 | 2 | *exact_match = _out_buffer[0] == target; | 752 | 2 | return true; | 753 | 2 | } | 754 | 6 | return found; | 755 | 8 | } |
Unexecuted instantiation: _ZN5doris10ForDecoderInE22seek_at_or_after_valueEPKvPb Unexecuted instantiation: _ZN5doris10ForDecoderIhE22seek_at_or_after_valueEPKvPb Unexecuted instantiation: _ZN5doris10ForDecoderItE22seek_at_or_after_valueEPKvPb Unexecuted instantiation: _ZN5doris10ForDecoderIjE22seek_at_or_after_valueEPKvPb Unexecuted instantiation: _ZN5doris10ForDecoderImE22seek_at_or_after_valueEPKvPb Unexecuted instantiation: _ZN5doris10ForDecoderINS_8uint24_tEE22seek_at_or_after_valueEPKvPb Unexecuted instantiation: _ZN5doris10ForDecoderIoE22seek_at_or_after_valueEPKvPb |
756 | | |
757 | | template class ForEncoder<int8_t>; |
758 | | template class ForEncoder<int16_t>; |
759 | | template class ForEncoder<int32_t>; |
760 | | template class ForEncoder<int64_t>; |
761 | | template class ForEncoder<int128_t>; |
762 | | template class ForEncoder<uint8_t>; |
763 | | template class ForEncoder<uint16_t>; |
764 | | template class ForEncoder<uint32_t>; |
765 | | template class ForEncoder<uint64_t>; |
766 | | template class ForEncoder<uint24_t>; |
767 | | template class ForEncoder<uint128_t>; |
768 | | |
769 | | template class ForDecoder<int8_t>; |
770 | | template class ForDecoder<int16_t>; |
771 | | template class ForDecoder<int32_t>; |
772 | | template class ForDecoder<int64_t>; |
773 | | template class ForDecoder<int128_t>; |
774 | | template class ForDecoder<uint8_t>; |
775 | | template class ForDecoder<uint16_t>; |
776 | | template class ForDecoder<uint32_t>; |
777 | | template class ForDecoder<uint64_t>; |
778 | | template class ForDecoder<uint24_t>; |
779 | | template class ForDecoder<uint128_t>; |
780 | | #include "common/compile_check_end.h" |
781 | | } // namespace doris |