221 | 5.42M | } Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi0ELb0EEEmPKh _ZN5doris11UnpackValueILi1ELi0ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi1ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi2ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi3ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi4ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi5ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi6ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi7ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi8ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi9ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi10ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi11ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi12ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi13ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi14ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi15ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi16ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi17ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi18ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi19ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi20ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi21ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi22ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi23ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi24ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi25ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi26ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi27ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi28ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi29ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi30ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
_ZN5doris11UnpackValueILi1ELi31ELb1EEEmPKh Line | Count | Source | 176 | 571 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 571 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 571 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 571 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 571 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 571 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 571 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 571 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 571 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 571 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 571 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 571 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 571 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 571 | constexpr bool READ_32_BITS = | 203 | 571 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 571 | if (READ_32_BITS) { | 206 | 571 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 571 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 571 | return word & mask; | 209 | 571 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 571 | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi1ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi1ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi1ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi1ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi1ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi1ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi1ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi1ELi23ELb0EEEmPKh Line | Count | Source | 176 | 23 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 23 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 23 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 23 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 23 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 23 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 23 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 23 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 23 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 23 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 23 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 23 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 23 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 23 | constexpr bool READ_32_BITS = | 203 | 23 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 23 | if (READ_32_BITS) { | 206 | 23 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 23 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 23 | return word & mask; | 209 | 23 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 23 | } |
_ZN5doris11UnpackValueILi1ELi22ELb0EEEmPKh Line | Count | Source | 176 | 23 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 23 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 23 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 23 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 23 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 23 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 23 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 23 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 23 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 23 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 23 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 23 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 23 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 23 | constexpr bool READ_32_BITS = | 203 | 23 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 23 | if (READ_32_BITS) { | 206 | 23 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 23 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 23 | return word & mask; | 209 | 23 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 23 | } |
_ZN5doris11UnpackValueILi1ELi21ELb0EEEmPKh Line | Count | Source | 176 | 23 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 23 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 23 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 23 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 23 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 23 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 23 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 23 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 23 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 23 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 23 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 23 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 23 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 23 | constexpr bool READ_32_BITS = | 203 | 23 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 23 | if (READ_32_BITS) { | 206 | 23 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 23 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 23 | return word & mask; | 209 | 23 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 23 | } |
_ZN5doris11UnpackValueILi1ELi20ELb0EEEmPKh Line | Count | Source | 176 | 23 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 23 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 23 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 23 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 23 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 23 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 23 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 23 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 23 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 23 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 23 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 23 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 23 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 23 | constexpr bool READ_32_BITS = | 203 | 23 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 23 | if (READ_32_BITS) { | 206 | 23 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 23 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 23 | return word & mask; | 209 | 23 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 23 | } |
_ZN5doris11UnpackValueILi1ELi19ELb0EEEmPKh Line | Count | Source | 176 | 23 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 23 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 23 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 23 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 23 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 23 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 23 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 23 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 23 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 23 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 23 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 23 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 23 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 23 | constexpr bool READ_32_BITS = | 203 | 23 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 23 | if (READ_32_BITS) { | 206 | 23 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 23 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 23 | return word & mask; | 209 | 23 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 23 | } |
_ZN5doris11UnpackValueILi1ELi18ELb0EEEmPKh Line | Count | Source | 176 | 23 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 23 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 23 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 23 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 23 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 23 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 23 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 23 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 23 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 23 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 23 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 23 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 23 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 23 | constexpr bool READ_32_BITS = | 203 | 23 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 23 | if (READ_32_BITS) { | 206 | 23 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 23 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 23 | return word & mask; | 209 | 23 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 23 | } |
_ZN5doris11UnpackValueILi1ELi17ELb0EEEmPKh Line | Count | Source | 176 | 23 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 23 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 23 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 23 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 23 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 23 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 23 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 23 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 23 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 23 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 23 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 23 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 23 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 23 | constexpr bool READ_32_BITS = | 203 | 23 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 23 | if (READ_32_BITS) { | 206 | 23 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 23 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 23 | return word & mask; | 209 | 23 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 23 | } |
_ZN5doris11UnpackValueILi1ELi16ELb0EEEmPKh Line | Count | Source | 176 | 23 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 23 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 23 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 23 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 23 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 23 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 23 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 23 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 23 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 23 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 23 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 23 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 23 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 23 | constexpr bool READ_32_BITS = | 203 | 23 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 23 | if (READ_32_BITS) { | 206 | 23 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 23 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 23 | return word & mask; | 209 | 23 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 23 | } |
_ZN5doris11UnpackValueILi1ELi15ELb0EEEmPKh Line | Count | Source | 176 | 55 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 55 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 55 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 55 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 55 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 55 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 55 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 55 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 55 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 55 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 55 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 55 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 55 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 55 | constexpr bool READ_32_BITS = | 203 | 55 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 55 | if (READ_32_BITS) { | 206 | 55 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 55 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 55 | return word & mask; | 209 | 55 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 55 | } |
_ZN5doris11UnpackValueILi1ELi14ELb0EEEmPKh Line | Count | Source | 176 | 55 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 55 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 55 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 55 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 55 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 55 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 55 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 55 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 55 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 55 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 55 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 55 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 55 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 55 | constexpr bool READ_32_BITS = | 203 | 55 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 55 | if (READ_32_BITS) { | 206 | 55 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 55 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 55 | return word & mask; | 209 | 55 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 55 | } |
_ZN5doris11UnpackValueILi1ELi13ELb0EEEmPKh Line | Count | Source | 176 | 55 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 55 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 55 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 55 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 55 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 55 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 55 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 55 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 55 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 55 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 55 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 55 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 55 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 55 | constexpr bool READ_32_BITS = | 203 | 55 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 55 | if (READ_32_BITS) { | 206 | 55 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 55 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 55 | return word & mask; | 209 | 55 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 55 | } |
_ZN5doris11UnpackValueILi1ELi12ELb0EEEmPKh Line | Count | Source | 176 | 55 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 55 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 55 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 55 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 55 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 55 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 55 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 55 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 55 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 55 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 55 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 55 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 55 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 55 | constexpr bool READ_32_BITS = | 203 | 55 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 55 | if (READ_32_BITS) { | 206 | 55 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 55 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 55 | return word & mask; | 209 | 55 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 55 | } |
_ZN5doris11UnpackValueILi1ELi11ELb0EEEmPKh Line | Count | Source | 176 | 55 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 55 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 55 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 55 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 55 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 55 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 55 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 55 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 55 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 55 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 55 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 55 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 55 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 55 | constexpr bool READ_32_BITS = | 203 | 55 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 55 | if (READ_32_BITS) { | 206 | 55 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 55 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 55 | return word & mask; | 209 | 55 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 55 | } |
_ZN5doris11UnpackValueILi1ELi10ELb0EEEmPKh Line | Count | Source | 176 | 55 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 55 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 55 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 55 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 55 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 55 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 55 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 55 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 55 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 55 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 55 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 55 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 55 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 55 | constexpr bool READ_32_BITS = | 203 | 55 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 55 | if (READ_32_BITS) { | 206 | 55 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 55 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 55 | return word & mask; | 209 | 55 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 55 | } |
_ZN5doris11UnpackValueILi1ELi9ELb0EEEmPKh Line | Count | Source | 176 | 55 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 55 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 55 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 55 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 55 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 55 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 55 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 55 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 55 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 55 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 55 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 55 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 55 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 55 | constexpr bool READ_32_BITS = | 203 | 55 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 55 | if (READ_32_BITS) { | 206 | 55 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 55 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 55 | return word & mask; | 209 | 55 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 55 | } |
_ZN5doris11UnpackValueILi1ELi8ELb0EEEmPKh Line | Count | Source | 176 | 55 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 55 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 55 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 55 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 55 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 55 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 55 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 55 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 55 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 55 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 55 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 55 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 55 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 55 | constexpr bool READ_32_BITS = | 203 | 55 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 55 | if (READ_32_BITS) { | 206 | 55 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 55 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 55 | return word & mask; | 209 | 55 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 55 | } |
_ZN5doris11UnpackValueILi1ELi7ELb0EEEmPKh Line | Count | Source | 176 | 115 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 115 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 115 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 115 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 115 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 115 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 115 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 115 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 115 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 115 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 115 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 115 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 115 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 115 | constexpr bool READ_32_BITS = | 203 | 115 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 115 | if (READ_32_BITS) { | 206 | 115 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 115 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 115 | return word & mask; | 209 | 115 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 115 | } |
_ZN5doris11UnpackValueILi1ELi6ELb0EEEmPKh Line | Count | Source | 176 | 115 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 115 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 115 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 115 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 115 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 115 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 115 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 115 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 115 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 115 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 115 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 115 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 115 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 115 | constexpr bool READ_32_BITS = | 203 | 115 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 115 | if (READ_32_BITS) { | 206 | 115 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 115 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 115 | return word & mask; | 209 | 115 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 115 | } |
_ZN5doris11UnpackValueILi1ELi5ELb0EEEmPKh Line | Count | Source | 176 | 115 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 115 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 115 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 115 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 115 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 115 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 115 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 115 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 115 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 115 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 115 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 115 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 115 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 115 | constexpr bool READ_32_BITS = | 203 | 115 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 115 | if (READ_32_BITS) { | 206 | 115 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 115 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 115 | return word & mask; | 209 | 115 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 115 | } |
_ZN5doris11UnpackValueILi1ELi4ELb0EEEmPKh Line | Count | Source | 176 | 115 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 115 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 115 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 115 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 115 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 115 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 115 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 115 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 115 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 115 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 115 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 115 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 115 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 115 | constexpr bool READ_32_BITS = | 203 | 115 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 115 | if (READ_32_BITS) { | 206 | 115 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 115 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 115 | return word & mask; | 209 | 115 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 115 | } |
_ZN5doris11UnpackValueILi1ELi3ELb0EEEmPKh Line | Count | Source | 176 | 115 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 115 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 115 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 115 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 115 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 115 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 115 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 115 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 115 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 115 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 115 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 115 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 115 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 115 | constexpr bool READ_32_BITS = | 203 | 115 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 115 | if (READ_32_BITS) { | 206 | 115 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 115 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 115 | return word & mask; | 209 | 115 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 115 | } |
_ZN5doris11UnpackValueILi1ELi2ELb0EEEmPKh Line | Count | Source | 176 | 115 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 115 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 115 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 115 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 115 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 115 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 115 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 115 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 115 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 115 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 115 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 115 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 115 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 115 | constexpr bool READ_32_BITS = | 203 | 115 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 115 | if (READ_32_BITS) { | 206 | 115 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 115 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 115 | return word & mask; | 209 | 115 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 115 | } |
_ZN5doris11UnpackValueILi1ELi1ELb0EEEmPKh Line | Count | Source | 176 | 115 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 115 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 115 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 115 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 115 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 115 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 115 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 115 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 115 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 115 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 115 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 115 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 115 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 115 | constexpr bool READ_32_BITS = | 203 | 115 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 115 | if (READ_32_BITS) { | 206 | 115 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 115 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 115 | return word & mask; | 209 | 115 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 115 | } |
_ZN5doris11UnpackValueILi1ELi0ELb0EEEmPKh Line | Count | Source | 176 | 115 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 115 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 115 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 115 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 115 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 115 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 115 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 115 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 115 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 115 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 115 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 115 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 115 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 115 | constexpr bool READ_32_BITS = | 203 | 115 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 115 | if (READ_32_BITS) { | 206 | 115 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 115 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 115 | return word & mask; | 209 | 115 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 115 | } |
_ZN5doris11UnpackValueILi2ELi0ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi1ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi2ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi3ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi4ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi5ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi6ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi7ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi8ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi9ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi10ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi11ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi12ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi13ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi14ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi15ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi16ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi17ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi18ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi19ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi20ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi21ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi22ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi23ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi24ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi25ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi26ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi27ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi28ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi29ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi30ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
_ZN5doris11UnpackValueILi2ELi31ELb1EEEmPKh Line | Count | Source | 176 | 16 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 16 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 16 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 16 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 16 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 16 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 16 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 16 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 16 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 16 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 16 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 16 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 16 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 16 | constexpr bool READ_32_BITS = | 203 | 16 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 16 | if (READ_32_BITS) { | 206 | 16 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 16 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 16 | return word & mask; | 209 | 16 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 16 | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi2ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi2ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi2ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi2ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi2ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi2ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi2ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi2ELi23ELb0EEEmPKh Line | Count | Source | 176 | 19 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 19 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 19 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 19 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 19 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 19 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 19 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 19 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 19 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 19 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 19 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 19 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 19 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 19 | constexpr bool READ_32_BITS = | 203 | 19 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 19 | if (READ_32_BITS) { | 206 | 19 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 19 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 19 | return word & mask; | 209 | 19 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 19 | } |
_ZN5doris11UnpackValueILi2ELi22ELb0EEEmPKh Line | Count | Source | 176 | 19 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 19 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 19 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 19 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 19 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 19 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 19 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 19 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 19 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 19 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 19 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 19 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 19 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 19 | constexpr bool READ_32_BITS = | 203 | 19 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 19 | if (READ_32_BITS) { | 206 | 19 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 19 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 19 | return word & mask; | 209 | 19 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 19 | } |
_ZN5doris11UnpackValueILi2ELi21ELb0EEEmPKh Line | Count | Source | 176 | 19 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 19 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 19 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 19 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 19 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 19 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 19 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 19 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 19 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 19 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 19 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 19 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 19 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 19 | constexpr bool READ_32_BITS = | 203 | 19 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 19 | if (READ_32_BITS) { | 206 | 19 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 19 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 19 | return word & mask; | 209 | 19 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 19 | } |
_ZN5doris11UnpackValueILi2ELi20ELb0EEEmPKh Line | Count | Source | 176 | 19 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 19 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 19 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 19 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 19 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 19 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 19 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 19 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 19 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 19 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 19 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 19 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 19 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 19 | constexpr bool READ_32_BITS = | 203 | 19 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 19 | if (READ_32_BITS) { | 206 | 19 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 19 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 19 | return word & mask; | 209 | 19 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 19 | } |
_ZN5doris11UnpackValueILi2ELi19ELb0EEEmPKh Line | Count | Source | 176 | 19 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 19 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 19 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 19 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 19 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 19 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 19 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 19 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 19 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 19 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 19 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 19 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 19 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 19 | constexpr bool READ_32_BITS = | 203 | 19 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 19 | if (READ_32_BITS) { | 206 | 19 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 19 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 19 | return word & mask; | 209 | 19 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 19 | } |
_ZN5doris11UnpackValueILi2ELi18ELb0EEEmPKh Line | Count | Source | 176 | 19 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 19 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 19 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 19 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 19 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 19 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 19 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 19 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 19 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 19 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 19 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 19 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 19 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 19 | constexpr bool READ_32_BITS = | 203 | 19 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 19 | if (READ_32_BITS) { | 206 | 19 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 19 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 19 | return word & mask; | 209 | 19 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 19 | } |
_ZN5doris11UnpackValueILi2ELi17ELb0EEEmPKh Line | Count | Source | 176 | 19 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 19 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 19 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 19 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 19 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 19 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 19 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 19 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 19 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 19 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 19 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 19 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 19 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 19 | constexpr bool READ_32_BITS = | 203 | 19 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 19 | if (READ_32_BITS) { | 206 | 19 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 19 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 19 | return word & mask; | 209 | 19 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 19 | } |
_ZN5doris11UnpackValueILi2ELi16ELb0EEEmPKh Line | Count | Source | 176 | 19 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 19 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 19 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 19 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 19 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 19 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 19 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 19 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 19 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 19 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 19 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 19 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 19 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 19 | constexpr bool READ_32_BITS = | 203 | 19 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 19 | if (READ_32_BITS) { | 206 | 19 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 19 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 19 | return word & mask; | 209 | 19 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 19 | } |
_ZN5doris11UnpackValueILi2ELi15ELb0EEEmPKh Line | Count | Source | 176 | 34 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 34 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 34 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 34 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 34 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 34 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 34 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 34 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 34 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 34 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 34 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 34 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 34 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 34 | constexpr bool READ_32_BITS = | 203 | 34 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 34 | if (READ_32_BITS) { | 206 | 34 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 34 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 34 | return word & mask; | 209 | 34 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 34 | } |
_ZN5doris11UnpackValueILi2ELi14ELb0EEEmPKh Line | Count | Source | 176 | 34 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 34 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 34 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 34 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 34 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 34 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 34 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 34 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 34 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 34 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 34 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 34 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 34 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 34 | constexpr bool READ_32_BITS = | 203 | 34 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 34 | if (READ_32_BITS) { | 206 | 34 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 34 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 34 | return word & mask; | 209 | 34 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 34 | } |
_ZN5doris11UnpackValueILi2ELi13ELb0EEEmPKh Line | Count | Source | 176 | 34 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 34 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 34 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 34 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 34 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 34 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 34 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 34 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 34 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 34 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 34 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 34 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 34 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 34 | constexpr bool READ_32_BITS = | 203 | 34 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 34 | if (READ_32_BITS) { | 206 | 34 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 34 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 34 | return word & mask; | 209 | 34 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 34 | } |
_ZN5doris11UnpackValueILi2ELi12ELb0EEEmPKh Line | Count | Source | 176 | 34 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 34 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 34 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 34 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 34 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 34 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 34 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 34 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 34 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 34 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 34 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 34 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 34 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 34 | constexpr bool READ_32_BITS = | 203 | 34 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 34 | if (READ_32_BITS) { | 206 | 34 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 34 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 34 | return word & mask; | 209 | 34 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 34 | } |
_ZN5doris11UnpackValueILi2ELi11ELb0EEEmPKh Line | Count | Source | 176 | 34 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 34 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 34 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 34 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 34 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 34 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 34 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 34 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 34 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 34 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 34 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 34 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 34 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 34 | constexpr bool READ_32_BITS = | 203 | 34 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 34 | if (READ_32_BITS) { | 206 | 34 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 34 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 34 | return word & mask; | 209 | 34 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 34 | } |
_ZN5doris11UnpackValueILi2ELi10ELb0EEEmPKh Line | Count | Source | 176 | 34 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 34 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 34 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 34 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 34 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 34 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 34 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 34 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 34 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 34 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 34 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 34 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 34 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 34 | constexpr bool READ_32_BITS = | 203 | 34 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 34 | if (READ_32_BITS) { | 206 | 34 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 34 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 34 | return word & mask; | 209 | 34 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 34 | } |
_ZN5doris11UnpackValueILi2ELi9ELb0EEEmPKh Line | Count | Source | 176 | 34 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 34 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 34 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 34 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 34 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 34 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 34 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 34 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 34 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 34 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 34 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 34 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 34 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 34 | constexpr bool READ_32_BITS = | 203 | 34 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 34 | if (READ_32_BITS) { | 206 | 34 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 34 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 34 | return word & mask; | 209 | 34 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 34 | } |
_ZN5doris11UnpackValueILi2ELi8ELb0EEEmPKh Line | Count | Source | 176 | 34 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 34 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 34 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 34 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 34 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 34 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 34 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 34 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 34 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 34 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 34 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 34 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 34 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 34 | constexpr bool READ_32_BITS = | 203 | 34 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 34 | if (READ_32_BITS) { | 206 | 34 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 34 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 34 | return word & mask; | 209 | 34 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 34 | } |
_ZN5doris11UnpackValueILi2ELi7ELb0EEEmPKh Line | Count | Source | 176 | 124 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 124 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 124 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 124 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 124 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 124 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 124 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 124 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 124 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 124 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 124 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 124 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 124 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 124 | constexpr bool READ_32_BITS = | 203 | 124 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 124 | if (READ_32_BITS) { | 206 | 124 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 124 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 124 | return word & mask; | 209 | 124 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 124 | } |
_ZN5doris11UnpackValueILi2ELi6ELb0EEEmPKh Line | Count | Source | 176 | 124 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 124 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 124 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 124 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 124 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 124 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 124 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 124 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 124 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 124 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 124 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 124 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 124 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 124 | constexpr bool READ_32_BITS = | 203 | 124 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 124 | if (READ_32_BITS) { | 206 | 124 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 124 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 124 | return word & mask; | 209 | 124 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 124 | } |
_ZN5doris11UnpackValueILi2ELi5ELb0EEEmPKh Line | Count | Source | 176 | 124 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 124 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 124 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 124 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 124 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 124 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 124 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 124 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 124 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 124 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 124 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 124 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 124 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 124 | constexpr bool READ_32_BITS = | 203 | 124 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 124 | if (READ_32_BITS) { | 206 | 124 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 124 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 124 | return word & mask; | 209 | 124 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 124 | } |
_ZN5doris11UnpackValueILi2ELi4ELb0EEEmPKh Line | Count | Source | 176 | 124 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 124 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 124 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 124 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 124 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 124 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 124 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 124 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 124 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 124 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 124 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 124 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 124 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 124 | constexpr bool READ_32_BITS = | 203 | 124 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 124 | if (READ_32_BITS) { | 206 | 124 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 124 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 124 | return word & mask; | 209 | 124 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 124 | } |
_ZN5doris11UnpackValueILi2ELi3ELb0EEEmPKh Line | Count | Source | 176 | 124 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 124 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 124 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 124 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 124 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 124 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 124 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 124 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 124 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 124 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 124 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 124 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 124 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 124 | constexpr bool READ_32_BITS = | 203 | 124 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 124 | if (READ_32_BITS) { | 206 | 124 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 124 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 124 | return word & mask; | 209 | 124 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 124 | } |
_ZN5doris11UnpackValueILi2ELi2ELb0EEEmPKh Line | Count | Source | 176 | 124 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 124 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 124 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 124 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 124 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 124 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 124 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 124 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 124 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 124 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 124 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 124 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 124 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 124 | constexpr bool READ_32_BITS = | 203 | 124 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 124 | if (READ_32_BITS) { | 206 | 124 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 124 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 124 | return word & mask; | 209 | 124 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 124 | } |
_ZN5doris11UnpackValueILi2ELi1ELb0EEEmPKh Line | Count | Source | 176 | 124 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 124 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 124 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 124 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 124 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 124 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 124 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 124 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 124 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 124 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 124 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 124 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 124 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 124 | constexpr bool READ_32_BITS = | 203 | 124 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 124 | if (READ_32_BITS) { | 206 | 124 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 124 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 124 | return word & mask; | 209 | 124 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 124 | } |
_ZN5doris11UnpackValueILi2ELi0ELb0EEEmPKh Line | Count | Source | 176 | 124 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 124 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 124 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 124 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 124 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 124 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 124 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 124 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 124 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 124 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 124 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 124 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 124 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 124 | constexpr bool READ_32_BITS = | 203 | 124 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 124 | if (READ_32_BITS) { | 206 | 124 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 124 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 124 | return word & mask; | 209 | 124 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 124 | } |
_ZN5doris11UnpackValueILi3ELi0ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.00k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.00k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.00k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.00k | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi1ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.00k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.00k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.00k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.00k | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi2ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.00k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.00k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.00k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.00k | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi3ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.00k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.00k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.00k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.00k | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi4ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.00k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.00k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.00k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.00k | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi5ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.00k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.00k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.00k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.00k | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi6ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.00k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.00k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.00k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.00k | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi7ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.00k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.00k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.00k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.00k | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi8ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.00k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.00k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.00k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.00k | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi9ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.00k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.00k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.00k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.00k | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi10ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.00k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.00k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.00k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.00k | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi11ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.00k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.00k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.00k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.00k | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi12ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.00k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.00k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.00k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.00k | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi13ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.00k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.00k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.00k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.00k | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi14ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.00k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.00k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.00k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.00k | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi15ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.00k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.00k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.00k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.00k | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi16ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.00k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.00k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.00k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.00k | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi17ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.00k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.00k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.00k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.00k | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi18ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.00k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.00k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.00k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.00k | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi19ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.00k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.00k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.00k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.00k | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi20ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.00k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.00k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.00k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.00k | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi21ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.00k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.00k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.00k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.00k | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi22ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 4.00k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.00k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.00k | return word & mask; | 209 | 4.00k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi23ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 4.00k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.00k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.00k | return word & mask; | 209 | 4.00k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi24ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 4.00k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.00k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.00k | return word & mask; | 209 | 4.00k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi25ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 4.00k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.00k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.00k | return word & mask; | 209 | 4.00k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi26ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 4.00k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.00k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.00k | return word & mask; | 209 | 4.00k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi27ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 4.00k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.00k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.00k | return word & mask; | 209 | 4.00k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi28ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 4.00k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.00k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.00k | return word & mask; | 209 | 4.00k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi29ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 4.00k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.00k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.00k | return word & mask; | 209 | 4.00k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi30ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 4.00k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.00k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.00k | return word & mask; | 209 | 4.00k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.00k | } |
_ZN5doris11UnpackValueILi3ELi31ELb1EEEmPKh Line | Count | Source | 176 | 4.00k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.00k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.00k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.00k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.00k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.00k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.00k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.00k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.00k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.00k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.00k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.00k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.00k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.00k | constexpr bool READ_32_BITS = | 203 | 4.00k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.00k | if (READ_32_BITS) { | 206 | 4.00k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.00k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.00k | return word & mask; | 209 | 4.00k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.00k | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi3ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi3ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi3ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi3ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi3ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi3ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi3ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi3ELi23ELb0EEEmPKh Line | Count | Source | 176 | 278 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 278 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 278 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 278 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 278 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 278 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 278 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 278 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 278 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 278 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 278 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 278 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 278 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 278 | constexpr bool READ_32_BITS = | 203 | 278 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 278 | if (READ_32_BITS) { | 206 | 278 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 278 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 278 | return word & mask; | 209 | 278 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 278 | } |
_ZN5doris11UnpackValueILi3ELi22ELb0EEEmPKh Line | Count | Source | 176 | 278 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 278 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 278 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 278 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 278 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 278 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 278 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 278 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 278 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 278 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 278 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 278 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 278 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 278 | constexpr bool READ_32_BITS = | 203 | 278 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 278 | if (READ_32_BITS) { | 206 | 278 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 278 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 278 | return word & mask; | 209 | 278 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 278 | } |
_ZN5doris11UnpackValueILi3ELi21ELb0EEEmPKh Line | Count | Source | 176 | 278 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 278 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 278 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 278 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 278 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 278 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 278 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 278 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 278 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 278 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 278 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 278 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 278 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 278 | constexpr bool READ_32_BITS = | 203 | 278 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 278 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 278 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 278 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 278 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 278 | return word & mask; | 221 | 278 | } |
_ZN5doris11UnpackValueILi3ELi20ELb0EEEmPKh Line | Count | Source | 176 | 278 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 278 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 278 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 278 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 278 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 278 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 278 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 278 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 278 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 278 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 278 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 278 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 278 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 278 | constexpr bool READ_32_BITS = | 203 | 278 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 278 | if (READ_32_BITS) { | 206 | 278 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 278 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 278 | return word & mask; | 209 | 278 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 278 | } |
_ZN5doris11UnpackValueILi3ELi19ELb0EEEmPKh Line | Count | Source | 176 | 278 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 278 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 278 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 278 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 278 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 278 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 278 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 278 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 278 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 278 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 278 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 278 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 278 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 278 | constexpr bool READ_32_BITS = | 203 | 278 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 278 | if (READ_32_BITS) { | 206 | 278 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 278 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 278 | return word & mask; | 209 | 278 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 278 | } |
_ZN5doris11UnpackValueILi3ELi18ELb0EEEmPKh Line | Count | Source | 176 | 278 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 278 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 278 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 278 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 278 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 278 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 278 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 278 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 278 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 278 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 278 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 278 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 278 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 278 | constexpr bool READ_32_BITS = | 203 | 278 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 278 | if (READ_32_BITS) { | 206 | 278 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 278 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 278 | return word & mask; | 209 | 278 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 278 | } |
_ZN5doris11UnpackValueILi3ELi17ELb0EEEmPKh Line | Count | Source | 176 | 278 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 278 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 278 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 278 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 278 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 278 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 278 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 278 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 278 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 278 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 278 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 278 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 278 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 278 | constexpr bool READ_32_BITS = | 203 | 278 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 278 | if (READ_32_BITS) { | 206 | 278 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 278 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 278 | return word & mask; | 209 | 278 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 278 | } |
_ZN5doris11UnpackValueILi3ELi16ELb0EEEmPKh Line | Count | Source | 176 | 278 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 278 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 278 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 278 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 278 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 278 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 278 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 278 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 278 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 278 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 278 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 278 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 278 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 278 | constexpr bool READ_32_BITS = | 203 | 278 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 278 | if (READ_32_BITS) { | 206 | 278 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 278 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 278 | return word & mask; | 209 | 278 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 278 | } |
_ZN5doris11UnpackValueILi3ELi15ELb0EEEmPKh Line | Count | Source | 176 | 323 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 323 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 323 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 323 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 323 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 323 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 323 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 323 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 323 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 323 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 323 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 323 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 323 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 323 | constexpr bool READ_32_BITS = | 203 | 323 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 323 | if (READ_32_BITS) { | 206 | 323 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 323 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 323 | return word & mask; | 209 | 323 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 323 | } |
_ZN5doris11UnpackValueILi3ELi14ELb0EEEmPKh Line | Count | Source | 176 | 323 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 323 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 323 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 323 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 323 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 323 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 323 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 323 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 323 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 323 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 323 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 323 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 323 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 323 | constexpr bool READ_32_BITS = | 203 | 323 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 323 | if (READ_32_BITS) { | 206 | 323 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 323 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 323 | return word & mask; | 209 | 323 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 323 | } |
_ZN5doris11UnpackValueILi3ELi13ELb0EEEmPKh Line | Count | Source | 176 | 323 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 323 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 323 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 323 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 323 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 323 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 323 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 323 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 323 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 323 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 323 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 323 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 323 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 323 | constexpr bool READ_32_BITS = | 203 | 323 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 323 | if (READ_32_BITS) { | 206 | 323 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 323 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 323 | return word & mask; | 209 | 323 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 323 | } |
_ZN5doris11UnpackValueILi3ELi12ELb0EEEmPKh Line | Count | Source | 176 | 323 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 323 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 323 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 323 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 323 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 323 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 323 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 323 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 323 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 323 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 323 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 323 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 323 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 323 | constexpr bool READ_32_BITS = | 203 | 323 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 323 | if (READ_32_BITS) { | 206 | 323 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 323 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 323 | return word & mask; | 209 | 323 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 323 | } |
_ZN5doris11UnpackValueILi3ELi11ELb0EEEmPKh Line | Count | Source | 176 | 323 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 323 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 323 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 323 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 323 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 323 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 323 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 323 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 323 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 323 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 323 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 323 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 323 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 323 | constexpr bool READ_32_BITS = | 203 | 323 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 323 | if (READ_32_BITS) { | 206 | 323 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 323 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 323 | return word & mask; | 209 | 323 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 323 | } |
_ZN5doris11UnpackValueILi3ELi10ELb0EEEmPKh Line | Count | Source | 176 | 323 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 323 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 323 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 323 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 323 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 323 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 323 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 323 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 323 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 323 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 323 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 323 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 323 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 323 | constexpr bool READ_32_BITS = | 203 | 323 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 323 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 323 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 323 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 323 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 323 | return word & mask; | 221 | 323 | } |
_ZN5doris11UnpackValueILi3ELi9ELb0EEEmPKh Line | Count | Source | 176 | 323 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 323 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 323 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 323 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 323 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 323 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 323 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 323 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 323 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 323 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 323 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 323 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 323 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 323 | constexpr bool READ_32_BITS = | 203 | 323 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 323 | if (READ_32_BITS) { | 206 | 323 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 323 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 323 | return word & mask; | 209 | 323 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 323 | } |
_ZN5doris11UnpackValueILi3ELi8ELb0EEEmPKh Line | Count | Source | 176 | 323 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 323 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 323 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 323 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 323 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 323 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 323 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 323 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 323 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 323 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 323 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 323 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 323 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 323 | constexpr bool READ_32_BITS = | 203 | 323 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 323 | if (READ_32_BITS) { | 206 | 323 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 323 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 323 | return word & mask; | 209 | 323 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 323 | } |
_ZN5doris11UnpackValueILi3ELi7ELb0EEEmPKh Line | Count | Source | 176 | 465 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 465 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 465 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 465 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 465 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 465 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 465 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 465 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 465 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 465 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 465 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 465 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 465 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 465 | constexpr bool READ_32_BITS = | 203 | 465 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 465 | if (READ_32_BITS) { | 206 | 465 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 465 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 465 | return word & mask; | 209 | 465 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 465 | } |
_ZN5doris11UnpackValueILi3ELi6ELb0EEEmPKh Line | Count | Source | 176 | 465 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 465 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 465 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 465 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 465 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 465 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 465 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 465 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 465 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 465 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 465 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 465 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 465 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 465 | constexpr bool READ_32_BITS = | 203 | 465 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 465 | if (READ_32_BITS) { | 206 | 465 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 465 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 465 | return word & mask; | 209 | 465 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 465 | } |
_ZN5doris11UnpackValueILi3ELi5ELb0EEEmPKh Line | Count | Source | 176 | 465 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 465 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 465 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 465 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 465 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 465 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 465 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 465 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 465 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 465 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 465 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 465 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 465 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 465 | constexpr bool READ_32_BITS = | 203 | 465 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 465 | if (READ_32_BITS) { | 206 | 465 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 465 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 465 | return word & mask; | 209 | 465 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 465 | } |
_ZN5doris11UnpackValueILi3ELi4ELb0EEEmPKh Line | Count | Source | 176 | 465 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 465 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 465 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 465 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 465 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 465 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 465 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 465 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 465 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 465 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 465 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 465 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 465 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 465 | constexpr bool READ_32_BITS = | 203 | 465 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 465 | if (READ_32_BITS) { | 206 | 465 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 465 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 465 | return word & mask; | 209 | 465 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 465 | } |
_ZN5doris11UnpackValueILi3ELi3ELb0EEEmPKh Line | Count | Source | 176 | 465 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 465 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 465 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 465 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 465 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 465 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 465 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 465 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 465 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 465 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 465 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 465 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 465 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 465 | constexpr bool READ_32_BITS = | 203 | 465 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 465 | if (READ_32_BITS) { | 206 | 465 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 465 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 465 | return word & mask; | 209 | 465 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 465 | } |
_ZN5doris11UnpackValueILi3ELi2ELb0EEEmPKh Line | Count | Source | 176 | 465 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 465 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 465 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 465 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 465 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 465 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 465 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 465 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 465 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 465 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 465 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 465 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 465 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 465 | constexpr bool READ_32_BITS = | 203 | 465 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 465 | if (READ_32_BITS) { | 206 | 465 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 465 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 465 | return word & mask; | 209 | 465 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 465 | } |
_ZN5doris11UnpackValueILi3ELi1ELb0EEEmPKh Line | Count | Source | 176 | 465 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 465 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 465 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 465 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 465 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 465 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 465 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 465 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 465 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 465 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 465 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 465 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 465 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 465 | constexpr bool READ_32_BITS = | 203 | 465 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 465 | if (READ_32_BITS) { | 206 | 465 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 465 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 465 | return word & mask; | 209 | 465 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 465 | } |
_ZN5doris11UnpackValueILi3ELi0ELb0EEEmPKh Line | Count | Source | 176 | 465 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 465 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 465 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 465 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 465 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 465 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 465 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 465 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 465 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 465 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 465 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 465 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 465 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 465 | constexpr bool READ_32_BITS = | 203 | 465 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 465 | if (READ_32_BITS) { | 206 | 465 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 465 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 465 | return word & mask; | 209 | 465 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 465 | } |
_ZN5doris11UnpackValueILi4ELi0ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi1ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi2ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi3ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi4ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi5ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi6ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi7ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi8ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi9ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi10ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi11ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi12ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi13ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi14ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi15ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi16ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi17ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi18ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi19ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi20ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi21ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi22ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi23ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi24ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi25ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi26ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi27ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi28ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi29ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi30ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
_ZN5doris11UnpackValueILi4ELi31ELb1EEEmPKh Line | Count | Source | 176 | 378 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 378 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 378 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 378 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 378 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 378 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 378 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 378 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 378 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 378 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 378 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 378 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 378 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 378 | constexpr bool READ_32_BITS = | 203 | 378 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 378 | if (READ_32_BITS) { | 206 | 378 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 378 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 378 | return word & mask; | 209 | 378 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 378 | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi4ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi4ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi4ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi4ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi4ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi4ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi4ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi4ELi23ELb0EEEmPKh Line | Count | Source | 176 | 40 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 40 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 40 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 40 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 40 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 40 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 40 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 40 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 40 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 40 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 40 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 40 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 40 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 40 | constexpr bool READ_32_BITS = | 203 | 40 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 40 | if (READ_32_BITS) { | 206 | 40 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 40 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 40 | return word & mask; | 209 | 40 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 40 | } |
_ZN5doris11UnpackValueILi4ELi22ELb0EEEmPKh Line | Count | Source | 176 | 40 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 40 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 40 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 40 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 40 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 40 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 40 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 40 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 40 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 40 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 40 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 40 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 40 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 40 | constexpr bool READ_32_BITS = | 203 | 40 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 40 | if (READ_32_BITS) { | 206 | 40 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 40 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 40 | return word & mask; | 209 | 40 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 40 | } |
_ZN5doris11UnpackValueILi4ELi21ELb0EEEmPKh Line | Count | Source | 176 | 40 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 40 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 40 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 40 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 40 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 40 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 40 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 40 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 40 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 40 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 40 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 40 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 40 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 40 | constexpr bool READ_32_BITS = | 203 | 40 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 40 | if (READ_32_BITS) { | 206 | 40 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 40 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 40 | return word & mask; | 209 | 40 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 40 | } |
_ZN5doris11UnpackValueILi4ELi20ELb0EEEmPKh Line | Count | Source | 176 | 40 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 40 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 40 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 40 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 40 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 40 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 40 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 40 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 40 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 40 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 40 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 40 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 40 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 40 | constexpr bool READ_32_BITS = | 203 | 40 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 40 | if (READ_32_BITS) { | 206 | 40 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 40 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 40 | return word & mask; | 209 | 40 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 40 | } |
_ZN5doris11UnpackValueILi4ELi19ELb0EEEmPKh Line | Count | Source | 176 | 40 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 40 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 40 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 40 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 40 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 40 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 40 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 40 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 40 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 40 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 40 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 40 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 40 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 40 | constexpr bool READ_32_BITS = | 203 | 40 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 40 | if (READ_32_BITS) { | 206 | 40 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 40 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 40 | return word & mask; | 209 | 40 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 40 | } |
_ZN5doris11UnpackValueILi4ELi18ELb0EEEmPKh Line | Count | Source | 176 | 40 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 40 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 40 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 40 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 40 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 40 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 40 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 40 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 40 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 40 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 40 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 40 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 40 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 40 | constexpr bool READ_32_BITS = | 203 | 40 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 40 | if (READ_32_BITS) { | 206 | 40 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 40 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 40 | return word & mask; | 209 | 40 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 40 | } |
_ZN5doris11UnpackValueILi4ELi17ELb0EEEmPKh Line | Count | Source | 176 | 40 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 40 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 40 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 40 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 40 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 40 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 40 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 40 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 40 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 40 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 40 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 40 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 40 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 40 | constexpr bool READ_32_BITS = | 203 | 40 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 40 | if (READ_32_BITS) { | 206 | 40 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 40 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 40 | return word & mask; | 209 | 40 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 40 | } |
_ZN5doris11UnpackValueILi4ELi16ELb0EEEmPKh Line | Count | Source | 176 | 40 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 40 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 40 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 40 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 40 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 40 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 40 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 40 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 40 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 40 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 40 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 40 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 40 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 40 | constexpr bool READ_32_BITS = | 203 | 40 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 40 | if (READ_32_BITS) { | 206 | 40 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 40 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 40 | return word & mask; | 209 | 40 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 40 | } |
_ZN5doris11UnpackValueILi4ELi15ELb0EEEmPKh Line | Count | Source | 176 | 175 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 175 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 175 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 175 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 175 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 175 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 175 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 175 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 175 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 175 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 175 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 175 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 175 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 175 | constexpr bool READ_32_BITS = | 203 | 175 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 175 | if (READ_32_BITS) { | 206 | 175 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 175 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 175 | return word & mask; | 209 | 175 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 175 | } |
_ZN5doris11UnpackValueILi4ELi14ELb0EEEmPKh Line | Count | Source | 176 | 175 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 175 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 175 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 175 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 175 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 175 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 175 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 175 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 175 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 175 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 175 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 175 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 175 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 175 | constexpr bool READ_32_BITS = | 203 | 175 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 175 | if (READ_32_BITS) { | 206 | 175 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 175 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 175 | return word & mask; | 209 | 175 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 175 | } |
_ZN5doris11UnpackValueILi4ELi13ELb0EEEmPKh Line | Count | Source | 176 | 175 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 175 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 175 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 175 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 175 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 175 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 175 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 175 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 175 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 175 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 175 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 175 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 175 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 175 | constexpr bool READ_32_BITS = | 203 | 175 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 175 | if (READ_32_BITS) { | 206 | 175 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 175 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 175 | return word & mask; | 209 | 175 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 175 | } |
_ZN5doris11UnpackValueILi4ELi12ELb0EEEmPKh Line | Count | Source | 176 | 175 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 175 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 175 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 175 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 175 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 175 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 175 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 175 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 175 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 175 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 175 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 175 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 175 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 175 | constexpr bool READ_32_BITS = | 203 | 175 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 175 | if (READ_32_BITS) { | 206 | 175 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 175 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 175 | return word & mask; | 209 | 175 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 175 | } |
_ZN5doris11UnpackValueILi4ELi11ELb0EEEmPKh Line | Count | Source | 176 | 175 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 175 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 175 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 175 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 175 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 175 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 175 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 175 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 175 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 175 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 175 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 175 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 175 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 175 | constexpr bool READ_32_BITS = | 203 | 175 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 175 | if (READ_32_BITS) { | 206 | 175 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 175 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 175 | return word & mask; | 209 | 175 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 175 | } |
_ZN5doris11UnpackValueILi4ELi10ELb0EEEmPKh Line | Count | Source | 176 | 175 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 175 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 175 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 175 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 175 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 175 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 175 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 175 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 175 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 175 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 175 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 175 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 175 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 175 | constexpr bool READ_32_BITS = | 203 | 175 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 175 | if (READ_32_BITS) { | 206 | 175 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 175 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 175 | return word & mask; | 209 | 175 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 175 | } |
_ZN5doris11UnpackValueILi4ELi9ELb0EEEmPKh Line | Count | Source | 176 | 175 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 175 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 175 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 175 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 175 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 175 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 175 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 175 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 175 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 175 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 175 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 175 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 175 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 175 | constexpr bool READ_32_BITS = | 203 | 175 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 175 | if (READ_32_BITS) { | 206 | 175 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 175 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 175 | return word & mask; | 209 | 175 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 175 | } |
_ZN5doris11UnpackValueILi4ELi8ELb0EEEmPKh Line | Count | Source | 176 | 175 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 175 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 175 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 175 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 175 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 175 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 175 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 175 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 175 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 175 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 175 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 175 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 175 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 175 | constexpr bool READ_32_BITS = | 203 | 175 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 175 | if (READ_32_BITS) { | 206 | 175 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 175 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 175 | return word & mask; | 209 | 175 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 175 | } |
_ZN5doris11UnpackValueILi4ELi7ELb0EEEmPKh Line | Count | Source | 176 | 194 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 194 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 194 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 194 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 194 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 194 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 194 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 194 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 194 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 194 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 194 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 194 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 194 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 194 | constexpr bool READ_32_BITS = | 203 | 194 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 194 | if (READ_32_BITS) { | 206 | 194 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 194 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 194 | return word & mask; | 209 | 194 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 194 | } |
_ZN5doris11UnpackValueILi4ELi6ELb0EEEmPKh Line | Count | Source | 176 | 194 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 194 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 194 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 194 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 194 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 194 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 194 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 194 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 194 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 194 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 194 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 194 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 194 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 194 | constexpr bool READ_32_BITS = | 203 | 194 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 194 | if (READ_32_BITS) { | 206 | 194 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 194 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 194 | return word & mask; | 209 | 194 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 194 | } |
_ZN5doris11UnpackValueILi4ELi5ELb0EEEmPKh Line | Count | Source | 176 | 194 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 194 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 194 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 194 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 194 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 194 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 194 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 194 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 194 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 194 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 194 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 194 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 194 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 194 | constexpr bool READ_32_BITS = | 203 | 194 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 194 | if (READ_32_BITS) { | 206 | 194 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 194 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 194 | return word & mask; | 209 | 194 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 194 | } |
_ZN5doris11UnpackValueILi4ELi4ELb0EEEmPKh Line | Count | Source | 176 | 194 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 194 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 194 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 194 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 194 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 194 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 194 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 194 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 194 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 194 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 194 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 194 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 194 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 194 | constexpr bool READ_32_BITS = | 203 | 194 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 194 | if (READ_32_BITS) { | 206 | 194 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 194 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 194 | return word & mask; | 209 | 194 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 194 | } |
_ZN5doris11UnpackValueILi4ELi3ELb0EEEmPKh Line | Count | Source | 176 | 194 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 194 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 194 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 194 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 194 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 194 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 194 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 194 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 194 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 194 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 194 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 194 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 194 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 194 | constexpr bool READ_32_BITS = | 203 | 194 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 194 | if (READ_32_BITS) { | 206 | 194 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 194 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 194 | return word & mask; | 209 | 194 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 194 | } |
_ZN5doris11UnpackValueILi4ELi2ELb0EEEmPKh Line | Count | Source | 176 | 194 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 194 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 194 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 194 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 194 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 194 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 194 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 194 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 194 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 194 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 194 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 194 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 194 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 194 | constexpr bool READ_32_BITS = | 203 | 194 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 194 | if (READ_32_BITS) { | 206 | 194 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 194 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 194 | return word & mask; | 209 | 194 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 194 | } |
_ZN5doris11UnpackValueILi4ELi1ELb0EEEmPKh Line | Count | Source | 176 | 194 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 194 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 194 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 194 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 194 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 194 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 194 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 194 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 194 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 194 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 194 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 194 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 194 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 194 | constexpr bool READ_32_BITS = | 203 | 194 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 194 | if (READ_32_BITS) { | 206 | 194 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 194 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 194 | return word & mask; | 209 | 194 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 194 | } |
_ZN5doris11UnpackValueILi4ELi0ELb0EEEmPKh Line | Count | Source | 176 | 194 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 194 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 194 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 194 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 194 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 194 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 194 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 194 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 194 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 194 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 194 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 194 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 194 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 194 | constexpr bool READ_32_BITS = | 203 | 194 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 194 | if (READ_32_BITS) { | 206 | 194 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 194 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 194 | return word & mask; | 209 | 194 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 194 | } |
_ZN5doris11UnpackValueILi5ELi0ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 3.01k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 3.01k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 3.01k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 3.01k | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi1ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 3.01k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 3.01k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 3.01k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 3.01k | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi2ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 3.01k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 3.01k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 3.01k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 3.01k | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi3ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 3.01k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 3.01k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 3.01k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 3.01k | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi4ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 3.01k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 3.01k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 3.01k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 3.01k | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi5ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 3.01k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 3.01k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 3.01k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 3.01k | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi6ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 3.01k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 3.01k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 3.01k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 3.01k | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi7ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 3.01k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 3.01k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 3.01k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 3.01k | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi8ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 3.01k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 3.01k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 3.01k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 3.01k | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi9ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 3.01k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 3.01k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 3.01k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 3.01k | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi10ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 3.01k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 3.01k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 3.01k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 3.01k | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi11ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 3.01k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 3.01k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 3.01k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 3.01k | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi12ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 3.01k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 3.01k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 3.01k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 3.01k | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi13ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 3.01k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 3.01k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 3.01k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 3.01k | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi14ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 3.01k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 3.01k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 3.01k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 3.01k | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi15ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 3.01k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 3.01k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 3.01k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 3.01k | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi16ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 3.01k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 3.01k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 3.01k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 3.01k | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi17ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 3.01k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 3.01k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 3.01k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 3.01k | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi18ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 3.01k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 3.01k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 3.01k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 3.01k | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi19ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 3.01k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 3.01k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 3.01k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 3.01k | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi20ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 3.01k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 3.01k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 3.01k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 3.01k | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi21ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 3.01k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 3.01k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 3.01k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 3.01k | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi22ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 3.01k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 3.01k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 3.01k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 3.01k | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi23ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 3.01k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 3.01k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 3.01k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 3.01k | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi24ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 3.01k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 3.01k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 3.01k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 3.01k | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi25ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 3.01k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 3.01k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 3.01k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 3.01k | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi26ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 3.01k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 3.01k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 3.01k | return word & mask; | 209 | 3.01k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi27ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 3.01k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 3.01k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 3.01k | return word & mask; | 209 | 3.01k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi28ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 3.01k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 3.01k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 3.01k | return word & mask; | 209 | 3.01k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi29ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 3.01k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 3.01k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 3.01k | return word & mask; | 209 | 3.01k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi30ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 3.01k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 3.01k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 3.01k | return word & mask; | 209 | 3.01k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 3.01k | } |
_ZN5doris11UnpackValueILi5ELi31ELb1EEEmPKh Line | Count | Source | 176 | 3.01k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 3.01k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 3.01k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 3.01k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 3.01k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 3.01k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 3.01k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 3.01k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 3.01k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 3.01k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 3.01k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 3.01k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 3.01k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 3.01k | constexpr bool READ_32_BITS = | 203 | 3.01k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 3.01k | if (READ_32_BITS) { | 206 | 3.01k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 3.01k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 3.01k | return word & mask; | 209 | 3.01k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 3.01k | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi5ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi5ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi5ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi5ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi5ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi5ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi5ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi5ELi23ELb0EEEmPKh Line | Count | Source | 176 | 239 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 239 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 239 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 239 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 239 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 239 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 239 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 239 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 239 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 239 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 239 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 239 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 239 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 239 | constexpr bool READ_32_BITS = | 203 | 239 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 239 | if (READ_32_BITS) { | 206 | 239 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 239 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 239 | return word & mask; | 209 | 239 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 239 | } |
_ZN5doris11UnpackValueILi5ELi22ELb0EEEmPKh Line | Count | Source | 176 | 239 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 239 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 239 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 239 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 239 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 239 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 239 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 239 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 239 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 239 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 239 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 239 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 239 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 239 | constexpr bool READ_32_BITS = | 203 | 239 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 239 | if (READ_32_BITS) { | 206 | 239 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 239 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 239 | return word & mask; | 209 | 239 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 239 | } |
_ZN5doris11UnpackValueILi5ELi21ELb0EEEmPKh Line | Count | Source | 176 | 239 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 239 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 239 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 239 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 239 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 239 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 239 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 239 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 239 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 239 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 239 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 239 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 239 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 239 | constexpr bool READ_32_BITS = | 203 | 239 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 239 | if (READ_32_BITS) { | 206 | 239 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 239 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 239 | return word & mask; | 209 | 239 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 239 | } |
_ZN5doris11UnpackValueILi5ELi20ELb0EEEmPKh Line | Count | Source | 176 | 239 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 239 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 239 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 239 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 239 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 239 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 239 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 239 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 239 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 239 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 239 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 239 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 239 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 239 | constexpr bool READ_32_BITS = | 203 | 239 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 239 | if (READ_32_BITS) { | 206 | 239 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 239 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 239 | return word & mask; | 209 | 239 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 239 | } |
_ZN5doris11UnpackValueILi5ELi19ELb0EEEmPKh Line | Count | Source | 176 | 239 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 239 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 239 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 239 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 239 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 239 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 239 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 239 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 239 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 239 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 239 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 239 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 239 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 239 | constexpr bool READ_32_BITS = | 203 | 239 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 239 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 239 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 239 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 239 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 239 | return word & mask; | 221 | 239 | } |
_ZN5doris11UnpackValueILi5ELi18ELb0EEEmPKh Line | Count | Source | 176 | 239 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 239 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 239 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 239 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 239 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 239 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 239 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 239 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 239 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 239 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 239 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 239 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 239 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 239 | constexpr bool READ_32_BITS = | 203 | 239 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 239 | if (READ_32_BITS) { | 206 | 239 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 239 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 239 | return word & mask; | 209 | 239 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 239 | } |
_ZN5doris11UnpackValueILi5ELi17ELb0EEEmPKh Line | Count | Source | 176 | 239 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 239 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 239 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 239 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 239 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 239 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 239 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 239 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 239 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 239 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 239 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 239 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 239 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 239 | constexpr bool READ_32_BITS = | 203 | 239 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 239 | if (READ_32_BITS) { | 206 | 239 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 239 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 239 | return word & mask; | 209 | 239 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 239 | } |
_ZN5doris11UnpackValueILi5ELi16ELb0EEEmPKh Line | Count | Source | 176 | 239 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 239 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 239 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 239 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 239 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 239 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 239 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 239 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 239 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 239 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 239 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 239 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 239 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 239 | constexpr bool READ_32_BITS = | 203 | 239 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 239 | if (READ_32_BITS) { | 206 | 239 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 239 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 239 | return word & mask; | 209 | 239 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 239 | } |
_ZN5doris11UnpackValueILi5ELi15ELb0EEEmPKh Line | Count | Source | 176 | 251 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 251 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 251 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 251 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 251 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 251 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 251 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 251 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 251 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 251 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 251 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 251 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 251 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 251 | constexpr bool READ_32_BITS = | 203 | 251 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 251 | if (READ_32_BITS) { | 206 | 251 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 251 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 251 | return word & mask; | 209 | 251 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 251 | } |
_ZN5doris11UnpackValueILi5ELi14ELb0EEEmPKh Line | Count | Source | 176 | 251 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 251 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 251 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 251 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 251 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 251 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 251 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 251 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 251 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 251 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 251 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 251 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 251 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 251 | constexpr bool READ_32_BITS = | 203 | 251 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 251 | if (READ_32_BITS) { | 206 | 251 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 251 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 251 | return word & mask; | 209 | 251 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 251 | } |
_ZN5doris11UnpackValueILi5ELi13ELb0EEEmPKh Line | Count | Source | 176 | 251 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 251 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 251 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 251 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 251 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 251 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 251 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 251 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 251 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 251 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 251 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 251 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 251 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 251 | constexpr bool READ_32_BITS = | 203 | 251 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 251 | if (READ_32_BITS) { | 206 | 251 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 251 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 251 | return word & mask; | 209 | 251 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 251 | } |
_ZN5doris11UnpackValueILi5ELi12ELb0EEEmPKh Line | Count | Source | 176 | 251 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 251 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 251 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 251 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 251 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 251 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 251 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 251 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 251 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 251 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 251 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 251 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 251 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 251 | constexpr bool READ_32_BITS = | 203 | 251 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 251 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 251 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 251 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 251 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 251 | return word & mask; | 221 | 251 | } |
_ZN5doris11UnpackValueILi5ELi11ELb0EEEmPKh Line | Count | Source | 176 | 251 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 251 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 251 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 251 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 251 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 251 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 251 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 251 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 251 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 251 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 251 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 251 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 251 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 251 | constexpr bool READ_32_BITS = | 203 | 251 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 251 | if (READ_32_BITS) { | 206 | 251 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 251 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 251 | return word & mask; | 209 | 251 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 251 | } |
_ZN5doris11UnpackValueILi5ELi10ELb0EEEmPKh Line | Count | Source | 176 | 251 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 251 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 251 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 251 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 251 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 251 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 251 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 251 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 251 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 251 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 251 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 251 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 251 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 251 | constexpr bool READ_32_BITS = | 203 | 251 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 251 | if (READ_32_BITS) { | 206 | 251 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 251 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 251 | return word & mask; | 209 | 251 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 251 | } |
_ZN5doris11UnpackValueILi5ELi9ELb0EEEmPKh Line | Count | Source | 176 | 251 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 251 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 251 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 251 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 251 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 251 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 251 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 251 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 251 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 251 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 251 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 251 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 251 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 251 | constexpr bool READ_32_BITS = | 203 | 251 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 251 | if (READ_32_BITS) { | 206 | 251 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 251 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 251 | return word & mask; | 209 | 251 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 251 | } |
_ZN5doris11UnpackValueILi5ELi8ELb0EEEmPKh Line | Count | Source | 176 | 251 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 251 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 251 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 251 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 251 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 251 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 251 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 251 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 251 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 251 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 251 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 251 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 251 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 251 | constexpr bool READ_32_BITS = | 203 | 251 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 251 | if (READ_32_BITS) { | 206 | 251 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 251 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 251 | return word & mask; | 209 | 251 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 251 | } |
_ZN5doris11UnpackValueILi5ELi7ELb0EEEmPKh Line | Count | Source | 176 | 276 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 276 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 276 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 276 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 276 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 276 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 276 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 276 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 276 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 276 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 276 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 276 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 276 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 276 | constexpr bool READ_32_BITS = | 203 | 276 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 276 | if (READ_32_BITS) { | 206 | 276 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 276 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 276 | return word & mask; | 209 | 276 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 276 | } |
_ZN5doris11UnpackValueILi5ELi6ELb0EEEmPKh Line | Count | Source | 176 | 276 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 276 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 276 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 276 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 276 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 276 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 276 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 276 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 276 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 276 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 276 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 276 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 276 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 276 | constexpr bool READ_32_BITS = | 203 | 276 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 276 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 276 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 276 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 276 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 276 | return word & mask; | 221 | 276 | } |
_ZN5doris11UnpackValueILi5ELi5ELb0EEEmPKh Line | Count | Source | 176 | 276 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 276 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 276 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 276 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 276 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 276 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 276 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 276 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 276 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 276 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 276 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 276 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 276 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 276 | constexpr bool READ_32_BITS = | 203 | 276 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 276 | if (READ_32_BITS) { | 206 | 276 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 276 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 276 | return word & mask; | 209 | 276 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 276 | } |
_ZN5doris11UnpackValueILi5ELi4ELb0EEEmPKh Line | Count | Source | 176 | 276 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 276 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 276 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 276 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 276 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 276 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 276 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 276 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 276 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 276 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 276 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 276 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 276 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 276 | constexpr bool READ_32_BITS = | 203 | 276 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 276 | if (READ_32_BITS) { | 206 | 276 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 276 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 276 | return word & mask; | 209 | 276 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 276 | } |
_ZN5doris11UnpackValueILi5ELi3ELb0EEEmPKh Line | Count | Source | 176 | 276 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 276 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 276 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 276 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 276 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 276 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 276 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 276 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 276 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 276 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 276 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 276 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 276 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 276 | constexpr bool READ_32_BITS = | 203 | 276 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 276 | if (READ_32_BITS) { | 206 | 276 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 276 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 276 | return word & mask; | 209 | 276 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 276 | } |
_ZN5doris11UnpackValueILi5ELi2ELb0EEEmPKh Line | Count | Source | 176 | 276 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 276 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 276 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 276 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 276 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 276 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 276 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 276 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 276 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 276 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 276 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 276 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 276 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 276 | constexpr bool READ_32_BITS = | 203 | 276 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 276 | if (READ_32_BITS) { | 206 | 276 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 276 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 276 | return word & mask; | 209 | 276 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 276 | } |
_ZN5doris11UnpackValueILi5ELi1ELb0EEEmPKh Line | Count | Source | 176 | 276 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 276 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 276 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 276 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 276 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 276 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 276 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 276 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 276 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 276 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 276 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 276 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 276 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 276 | constexpr bool READ_32_BITS = | 203 | 276 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 276 | if (READ_32_BITS) { | 206 | 276 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 276 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 276 | return word & mask; | 209 | 276 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 276 | } |
_ZN5doris11UnpackValueILi5ELi0ELb0EEEmPKh Line | Count | Source | 176 | 276 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 276 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 276 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 276 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 276 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 276 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 276 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 276 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 276 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 276 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 276 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 276 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 276 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 276 | constexpr bool READ_32_BITS = | 203 | 276 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 276 | if (READ_32_BITS) { | 206 | 276 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 276 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 276 | return word & mask; | 209 | 276 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 276 | } |
_ZN5doris11UnpackValueILi6ELi0ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 586 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 586 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 586 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 586 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi1ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 586 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 586 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 586 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 586 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi2ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 586 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 586 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 586 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 586 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi3ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 586 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 586 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 586 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 586 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi4ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 586 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 586 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 586 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 586 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi5ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 586 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 586 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 586 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 586 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi6ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 586 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 586 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 586 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 586 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi7ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 586 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 586 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 586 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 586 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi8ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 586 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 586 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 586 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 586 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi9ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 586 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 586 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 586 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 586 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi10ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 586 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 586 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 586 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 586 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi11ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 586 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 586 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 586 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 586 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi12ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 586 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 586 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 586 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 586 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi13ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 586 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 586 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 586 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 586 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi14ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 586 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 586 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 586 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 586 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi15ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 586 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 586 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 586 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 586 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi16ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 586 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 586 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 586 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 586 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi17ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 586 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 586 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 586 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 586 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi18ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 586 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 586 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 586 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 586 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi19ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 586 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 586 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 586 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 586 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi20ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 586 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 586 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 586 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 586 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi21ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 586 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 586 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 586 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 586 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi22ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 586 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 586 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 586 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 586 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi23ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 586 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 586 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 586 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 586 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi24ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 586 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 586 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 586 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 586 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi25ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 586 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 586 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 586 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 586 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi26ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 586 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 586 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 586 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 586 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi27ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 586 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 586 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 586 | return word & mask; | 209 | 586 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi28ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 586 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 586 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 586 | return word & mask; | 209 | 586 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi29ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 586 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 586 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 586 | return word & mask; | 209 | 586 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi30ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 586 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 586 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 586 | return word & mask; | 209 | 586 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 586 | } |
_ZN5doris11UnpackValueILi6ELi31ELb1EEEmPKh Line | Count | Source | 176 | 586 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 586 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 586 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 586 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 586 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 586 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 586 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 586 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 586 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 586 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 586 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 586 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 586 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 586 | constexpr bool READ_32_BITS = | 203 | 586 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 586 | if (READ_32_BITS) { | 206 | 586 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 586 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 586 | return word & mask; | 209 | 586 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 586 | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi6ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi6ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi6ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi6ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi6ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi6ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi6ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi6ELi23ELb0EEEmPKh Line | Count | Source | 176 | 40 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 40 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 40 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 40 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 40 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 40 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 40 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 40 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 40 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 40 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 40 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 40 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 40 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 40 | constexpr bool READ_32_BITS = | 203 | 40 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 40 | if (READ_32_BITS) { | 206 | 40 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 40 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 40 | return word & mask; | 209 | 40 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 40 | } |
_ZN5doris11UnpackValueILi6ELi22ELb0EEEmPKh Line | Count | Source | 176 | 40 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 40 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 40 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 40 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 40 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 40 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 40 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 40 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 40 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 40 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 40 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 40 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 40 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 40 | constexpr bool READ_32_BITS = | 203 | 40 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 40 | if (READ_32_BITS) { | 206 | 40 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 40 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 40 | return word & mask; | 209 | 40 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 40 | } |
_ZN5doris11UnpackValueILi6ELi21ELb0EEEmPKh Line | Count | Source | 176 | 40 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 40 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 40 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 40 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 40 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 40 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 40 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 40 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 40 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 40 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 40 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 40 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 40 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 40 | constexpr bool READ_32_BITS = | 203 | 40 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 40 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 40 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 40 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 40 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 40 | return word & mask; | 221 | 40 | } |
_ZN5doris11UnpackValueILi6ELi20ELb0EEEmPKh Line | Count | Source | 176 | 40 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 40 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 40 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 40 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 40 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 40 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 40 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 40 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 40 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 40 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 40 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 40 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 40 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 40 | constexpr bool READ_32_BITS = | 203 | 40 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 40 | if (READ_32_BITS) { | 206 | 40 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 40 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 40 | return word & mask; | 209 | 40 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 40 | } |
_ZN5doris11UnpackValueILi6ELi19ELb0EEEmPKh Line | Count | Source | 176 | 40 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 40 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 40 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 40 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 40 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 40 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 40 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 40 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 40 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 40 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 40 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 40 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 40 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 40 | constexpr bool READ_32_BITS = | 203 | 40 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 40 | if (READ_32_BITS) { | 206 | 40 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 40 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 40 | return word & mask; | 209 | 40 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 40 | } |
_ZN5doris11UnpackValueILi6ELi18ELb0EEEmPKh Line | Count | Source | 176 | 40 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 40 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 40 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 40 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 40 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 40 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 40 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 40 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 40 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 40 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 40 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 40 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 40 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 40 | constexpr bool READ_32_BITS = | 203 | 40 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 40 | if (READ_32_BITS) { | 206 | 40 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 40 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 40 | return word & mask; | 209 | 40 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 40 | } |
_ZN5doris11UnpackValueILi6ELi17ELb0EEEmPKh Line | Count | Source | 176 | 40 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 40 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 40 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 40 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 40 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 40 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 40 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 40 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 40 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 40 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 40 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 40 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 40 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 40 | constexpr bool READ_32_BITS = | 203 | 40 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 40 | if (READ_32_BITS) { | 206 | 40 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 40 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 40 | return word & mask; | 209 | 40 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 40 | } |
_ZN5doris11UnpackValueILi6ELi16ELb0EEEmPKh Line | Count | Source | 176 | 40 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 40 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 40 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 40 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 40 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 40 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 40 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 40 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 40 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 40 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 40 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 40 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 40 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 40 | constexpr bool READ_32_BITS = | 203 | 40 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 40 | if (READ_32_BITS) { | 206 | 40 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 40 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 40 | return word & mask; | 209 | 40 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 40 | } |
_ZN5doris11UnpackValueILi6ELi15ELb0EEEmPKh Line | Count | Source | 176 | 43 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 43 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 43 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 43 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 43 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 43 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 43 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 43 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 43 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 43 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 43 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 43 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 43 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 43 | constexpr bool READ_32_BITS = | 203 | 43 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 43 | if (READ_32_BITS) { | 206 | 43 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 43 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 43 | return word & mask; | 209 | 43 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 43 | } |
_ZN5doris11UnpackValueILi6ELi14ELb0EEEmPKh Line | Count | Source | 176 | 43 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 43 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 43 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 43 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 43 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 43 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 43 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 43 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 43 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 43 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 43 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 43 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 43 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 43 | constexpr bool READ_32_BITS = | 203 | 43 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 43 | if (READ_32_BITS) { | 206 | 43 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 43 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 43 | return word & mask; | 209 | 43 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 43 | } |
_ZN5doris11UnpackValueILi6ELi13ELb0EEEmPKh Line | Count | Source | 176 | 43 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 43 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 43 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 43 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 43 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 43 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 43 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 43 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 43 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 43 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 43 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 43 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 43 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 43 | constexpr bool READ_32_BITS = | 203 | 43 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 43 | if (READ_32_BITS) { | 206 | 43 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 43 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 43 | return word & mask; | 209 | 43 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 43 | } |
_ZN5doris11UnpackValueILi6ELi12ELb0EEEmPKh Line | Count | Source | 176 | 43 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 43 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 43 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 43 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 43 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 43 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 43 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 43 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 43 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 43 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 43 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 43 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 43 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 43 | constexpr bool READ_32_BITS = | 203 | 43 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 43 | if (READ_32_BITS) { | 206 | 43 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 43 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 43 | return word & mask; | 209 | 43 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 43 | } |
_ZN5doris11UnpackValueILi6ELi11ELb0EEEmPKh Line | Count | Source | 176 | 43 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 43 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 43 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 43 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 43 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 43 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 43 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 43 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 43 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 43 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 43 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 43 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 43 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 43 | constexpr bool READ_32_BITS = | 203 | 43 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 43 | if (READ_32_BITS) { | 206 | 43 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 43 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 43 | return word & mask; | 209 | 43 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 43 | } |
_ZN5doris11UnpackValueILi6ELi10ELb0EEEmPKh Line | Count | Source | 176 | 43 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 43 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 43 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 43 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 43 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 43 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 43 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 43 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 43 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 43 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 43 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 43 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 43 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 43 | constexpr bool READ_32_BITS = | 203 | 43 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 43 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 43 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 43 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 43 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 43 | return word & mask; | 221 | 43 | } |
_ZN5doris11UnpackValueILi6ELi9ELb0EEEmPKh Line | Count | Source | 176 | 43 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 43 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 43 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 43 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 43 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 43 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 43 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 43 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 43 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 43 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 43 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 43 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 43 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 43 | constexpr bool READ_32_BITS = | 203 | 43 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 43 | if (READ_32_BITS) { | 206 | 43 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 43 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 43 | return word & mask; | 209 | 43 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 43 | } |
_ZN5doris11UnpackValueILi6ELi8ELb0EEEmPKh Line | Count | Source | 176 | 43 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 43 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 43 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 43 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 43 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 43 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 43 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 43 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 43 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 43 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 43 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 43 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 43 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 43 | constexpr bool READ_32_BITS = | 203 | 43 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 43 | if (READ_32_BITS) { | 206 | 43 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 43 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 43 | return word & mask; | 209 | 43 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 43 | } |
_ZN5doris11UnpackValueILi6ELi7ELb0EEEmPKh Line | Count | Source | 176 | 76 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 76 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 76 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 76 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 76 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 76 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 76 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 76 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 76 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 76 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 76 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 76 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 76 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 76 | constexpr bool READ_32_BITS = | 203 | 76 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 76 | if (READ_32_BITS) { | 206 | 76 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 76 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 76 | return word & mask; | 209 | 76 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 76 | } |
_ZN5doris11UnpackValueILi6ELi6ELb0EEEmPKh Line | Count | Source | 176 | 76 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 76 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 76 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 76 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 76 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 76 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 76 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 76 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 76 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 76 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 76 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 76 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 76 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 76 | constexpr bool READ_32_BITS = | 203 | 76 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 76 | if (READ_32_BITS) { | 206 | 76 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 76 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 76 | return word & mask; | 209 | 76 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 76 | } |
_ZN5doris11UnpackValueILi6ELi5ELb0EEEmPKh Line | Count | Source | 176 | 76 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 76 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 76 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 76 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 76 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 76 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 76 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 76 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 76 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 76 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 76 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 76 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 76 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 76 | constexpr bool READ_32_BITS = | 203 | 76 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 76 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 76 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 76 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 76 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 76 | return word & mask; | 221 | 76 | } |
_ZN5doris11UnpackValueILi6ELi4ELb0EEEmPKh Line | Count | Source | 176 | 76 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 76 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 76 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 76 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 76 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 76 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 76 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 76 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 76 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 76 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 76 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 76 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 76 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 76 | constexpr bool READ_32_BITS = | 203 | 76 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 76 | if (READ_32_BITS) { | 206 | 76 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 76 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 76 | return word & mask; | 209 | 76 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 76 | } |
_ZN5doris11UnpackValueILi6ELi3ELb0EEEmPKh Line | Count | Source | 176 | 76 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 76 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 76 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 76 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 76 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 76 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 76 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 76 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 76 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 76 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 76 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 76 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 76 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 76 | constexpr bool READ_32_BITS = | 203 | 76 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 76 | if (READ_32_BITS) { | 206 | 76 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 76 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 76 | return word & mask; | 209 | 76 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 76 | } |
_ZN5doris11UnpackValueILi6ELi2ELb0EEEmPKh Line | Count | Source | 176 | 76 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 76 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 76 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 76 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 76 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 76 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 76 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 76 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 76 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 76 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 76 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 76 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 76 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 76 | constexpr bool READ_32_BITS = | 203 | 76 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 76 | if (READ_32_BITS) { | 206 | 76 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 76 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 76 | return word & mask; | 209 | 76 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 76 | } |
_ZN5doris11UnpackValueILi6ELi1ELb0EEEmPKh Line | Count | Source | 176 | 76 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 76 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 76 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 76 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 76 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 76 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 76 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 76 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 76 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 76 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 76 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 76 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 76 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 76 | constexpr bool READ_32_BITS = | 203 | 76 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 76 | if (READ_32_BITS) { | 206 | 76 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 76 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 76 | return word & mask; | 209 | 76 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 76 | } |
_ZN5doris11UnpackValueILi6ELi0ELb0EEEmPKh Line | Count | Source | 176 | 76 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 76 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 76 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 76 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 76 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 76 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 76 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 76 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 76 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 76 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 76 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 76 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 76 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 76 | constexpr bool READ_32_BITS = | 203 | 76 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 76 | if (READ_32_BITS) { | 206 | 76 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 76 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 76 | return word & mask; | 209 | 76 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 76 | } |
_ZN5doris11UnpackValueILi7ELi0ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 123 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 123 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 123 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 123 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi1ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 123 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 123 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 123 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 123 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi2ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 123 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 123 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 123 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 123 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi3ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 123 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 123 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 123 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 123 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi4ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 123 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 123 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 123 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 123 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi5ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 123 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 123 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 123 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 123 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi6ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 123 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 123 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 123 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 123 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi7ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 123 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 123 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 123 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 123 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi8ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 123 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 123 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 123 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 123 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi9ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 123 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 123 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 123 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 123 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi10ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 123 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 123 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 123 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 123 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi11ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 123 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 123 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 123 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 123 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi12ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 123 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 123 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 123 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 123 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi13ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 123 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 123 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 123 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 123 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi14ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 123 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 123 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 123 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 123 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi15ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 123 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 123 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 123 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 123 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi16ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 123 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 123 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 123 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 123 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi17ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 123 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 123 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 123 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 123 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi18ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 123 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 123 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 123 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 123 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi19ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 123 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 123 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 123 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 123 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi20ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 123 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 123 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 123 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 123 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi21ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 123 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 123 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 123 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 123 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi22ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 123 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 123 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 123 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 123 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi23ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 123 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 123 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 123 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 123 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi24ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 123 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 123 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 123 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 123 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi25ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 123 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 123 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 123 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 123 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi26ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 123 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 123 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 123 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 123 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi27ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 123 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 123 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 123 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 123 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi28ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 123 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 123 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 123 | return word & mask; | 209 | 123 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi29ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 123 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 123 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 123 | return word & mask; | 209 | 123 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi30ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 123 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 123 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 123 | return word & mask; | 209 | 123 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 123 | } |
_ZN5doris11UnpackValueILi7ELi31ELb1EEEmPKh Line | Count | Source | 176 | 123 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 123 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 123 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 123 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 123 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 123 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 123 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 123 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 123 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 123 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 123 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 123 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 123 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 123 | constexpr bool READ_32_BITS = | 203 | 123 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 123 | if (READ_32_BITS) { | 206 | 123 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 123 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 123 | return word & mask; | 209 | 123 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 123 | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi8ELb0EEEmPKh _ZN5doris11UnpackValueILi7ELi7ELb0EEEmPKh Line | Count | Source | 176 | 41 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 41 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 41 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 41 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 41 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 41 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 41 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 41 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 41 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 41 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 41 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 41 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 41 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 41 | constexpr bool READ_32_BITS = | 203 | 41 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 41 | if (READ_32_BITS) { | 206 | 41 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 41 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 41 | return word & mask; | 209 | 41 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 41 | } |
_ZN5doris11UnpackValueILi7ELi6ELb0EEEmPKh Line | Count | Source | 176 | 41 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 41 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 41 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 41 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 41 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 41 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 41 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 41 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 41 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 41 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 41 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 41 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 41 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 41 | constexpr bool READ_32_BITS = | 203 | 41 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 41 | if (READ_32_BITS) { | 206 | 41 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 41 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 41 | return word & mask; | 209 | 41 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 41 | } |
_ZN5doris11UnpackValueILi7ELi5ELb0EEEmPKh Line | Count | Source | 176 | 41 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 41 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 41 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 41 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 41 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 41 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 41 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 41 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 41 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 41 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 41 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 41 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 41 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 41 | constexpr bool READ_32_BITS = | 203 | 41 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 41 | if (READ_32_BITS) { | 206 | 41 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 41 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 41 | return word & mask; | 209 | 41 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 41 | } |
_ZN5doris11UnpackValueILi7ELi4ELb0EEEmPKh Line | Count | Source | 176 | 41 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 41 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 41 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 41 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 41 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 41 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 41 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 41 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 41 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 41 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 41 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 41 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 41 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 41 | constexpr bool READ_32_BITS = | 203 | 41 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 41 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 41 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 41 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 41 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 41 | return word & mask; | 221 | 41 | } |
_ZN5doris11UnpackValueILi7ELi3ELb0EEEmPKh Line | Count | Source | 176 | 41 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 41 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 41 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 41 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 41 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 41 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 41 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 41 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 41 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 41 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 41 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 41 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 41 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 41 | constexpr bool READ_32_BITS = | 203 | 41 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 41 | if (READ_32_BITS) { | 206 | 41 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 41 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 41 | return word & mask; | 209 | 41 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 41 | } |
_ZN5doris11UnpackValueILi7ELi2ELb0EEEmPKh Line | Count | Source | 176 | 41 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 41 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 41 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 41 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 41 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 41 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 41 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 41 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 41 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 41 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 41 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 41 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 41 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 41 | constexpr bool READ_32_BITS = | 203 | 41 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 41 | if (READ_32_BITS) { | 206 | 41 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 41 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 41 | return word & mask; | 209 | 41 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 41 | } |
_ZN5doris11UnpackValueILi7ELi1ELb0EEEmPKh Line | Count | Source | 176 | 41 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 41 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 41 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 41 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 41 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 41 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 41 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 41 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 41 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 41 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 41 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 41 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 41 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 41 | constexpr bool READ_32_BITS = | 203 | 41 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 41 | if (READ_32_BITS) { | 206 | 41 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 41 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 41 | return word & mask; | 209 | 41 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 41 | } |
_ZN5doris11UnpackValueILi7ELi0ELb0EEEmPKh Line | Count | Source | 176 | 41 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 41 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 41 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 41 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 41 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 41 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 41 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 41 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 41 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 41 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 41 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 41 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 41 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 41 | constexpr bool READ_32_BITS = | 203 | 41 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 41 | if (READ_32_BITS) { | 206 | 41 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 41 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 41 | return word & mask; | 209 | 41 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 41 | } |
_ZN5doris11UnpackValueILi8ELi0ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi1ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi2ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi3ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi4ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi5ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi6ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi7ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi8ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi9ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi10ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi11ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi12ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi13ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi14ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi15ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi16ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi17ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi18ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi19ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi20ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi21ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi22ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi23ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi24ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi25ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi26ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi27ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi28ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi29ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi30ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
_ZN5doris11UnpackValueILi8ELi31ELb1EEEmPKh Line | Count | Source | 176 | 18.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 18.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 18.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 18.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 18.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 18.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 18.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 18.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 18.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 18.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 18.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 18.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 18.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 18.7k | constexpr bool READ_32_BITS = | 203 | 18.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 18.7k | if (READ_32_BITS) { | 206 | 18.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 18.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 18.7k | return word & mask; | 209 | 18.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 18.7k | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi8ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi8ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi8ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi8ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi8ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi8ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi8ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi8ELi23ELb0EEEmPKh Line | Count | Source | 176 | 1.24k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.24k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.24k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.24k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.24k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.24k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.24k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.24k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.24k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.24k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.24k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.24k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.24k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.24k | constexpr bool READ_32_BITS = | 203 | 1.24k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.24k | if (READ_32_BITS) { | 206 | 1.24k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 1.24k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 1.24k | return word & mask; | 209 | 1.24k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 1.24k | } |
_ZN5doris11UnpackValueILi8ELi22ELb0EEEmPKh Line | Count | Source | 176 | 1.24k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.24k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.24k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.24k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.24k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.24k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.24k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.24k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.24k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.24k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.24k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.24k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.24k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.24k | constexpr bool READ_32_BITS = | 203 | 1.24k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.24k | if (READ_32_BITS) { | 206 | 1.24k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 1.24k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 1.24k | return word & mask; | 209 | 1.24k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 1.24k | } |
_ZN5doris11UnpackValueILi8ELi21ELb0EEEmPKh Line | Count | Source | 176 | 1.24k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.24k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.24k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.24k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.24k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.24k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.24k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.24k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.24k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.24k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.24k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.24k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.24k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.24k | constexpr bool READ_32_BITS = | 203 | 1.24k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.24k | if (READ_32_BITS) { | 206 | 1.24k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 1.24k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 1.24k | return word & mask; | 209 | 1.24k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 1.24k | } |
_ZN5doris11UnpackValueILi8ELi20ELb0EEEmPKh Line | Count | Source | 176 | 1.24k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.24k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.24k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.24k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.24k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.24k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.24k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.24k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.24k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.24k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.24k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.24k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.24k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.24k | constexpr bool READ_32_BITS = | 203 | 1.24k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.24k | if (READ_32_BITS) { | 206 | 1.24k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 1.24k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 1.24k | return word & mask; | 209 | 1.24k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 1.24k | } |
_ZN5doris11UnpackValueILi8ELi19ELb0EEEmPKh Line | Count | Source | 176 | 1.24k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.24k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.24k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.24k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.24k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.24k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.24k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.24k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.24k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.24k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.24k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.24k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.24k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.24k | constexpr bool READ_32_BITS = | 203 | 1.24k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.24k | if (READ_32_BITS) { | 206 | 1.24k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 1.24k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 1.24k | return word & mask; | 209 | 1.24k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 1.24k | } |
_ZN5doris11UnpackValueILi8ELi18ELb0EEEmPKh Line | Count | Source | 176 | 1.24k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.24k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.24k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.24k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.24k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.24k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.24k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.24k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.24k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.24k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.24k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.24k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.24k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.24k | constexpr bool READ_32_BITS = | 203 | 1.24k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.24k | if (READ_32_BITS) { | 206 | 1.24k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 1.24k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 1.24k | return word & mask; | 209 | 1.24k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 1.24k | } |
_ZN5doris11UnpackValueILi8ELi17ELb0EEEmPKh Line | Count | Source | 176 | 1.24k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.24k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.24k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.24k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.24k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.24k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.24k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.24k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.24k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.24k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.24k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.24k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.24k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.24k | constexpr bool READ_32_BITS = | 203 | 1.24k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.24k | if (READ_32_BITS) { | 206 | 1.24k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 1.24k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 1.24k | return word & mask; | 209 | 1.24k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 1.24k | } |
_ZN5doris11UnpackValueILi8ELi16ELb0EEEmPKh Line | Count | Source | 176 | 1.24k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.24k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.24k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.24k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.24k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.24k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.24k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.24k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.24k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.24k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.24k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.24k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.24k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.24k | constexpr bool READ_32_BITS = | 203 | 1.24k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.24k | if (READ_32_BITS) { | 206 | 1.24k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 1.24k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 1.24k | return word & mask; | 209 | 1.24k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 1.24k | } |
_ZN5doris11UnpackValueILi8ELi15ELb0EEEmPKh Line | Count | Source | 176 | 1.25k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.25k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.25k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.25k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.25k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.25k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.25k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.25k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.25k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.25k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.25k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.25k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.25k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.25k | constexpr bool READ_32_BITS = | 203 | 1.25k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.25k | if (READ_32_BITS) { | 206 | 1.25k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 1.25k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 1.25k | return word & mask; | 209 | 1.25k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 1.25k | } |
_ZN5doris11UnpackValueILi8ELi14ELb0EEEmPKh Line | Count | Source | 176 | 1.25k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.25k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.25k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.25k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.25k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.25k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.25k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.25k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.25k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.25k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.25k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.25k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.25k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.25k | constexpr bool READ_32_BITS = | 203 | 1.25k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.25k | if (READ_32_BITS) { | 206 | 1.25k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 1.25k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 1.25k | return word & mask; | 209 | 1.25k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 1.25k | } |
_ZN5doris11UnpackValueILi8ELi13ELb0EEEmPKh Line | Count | Source | 176 | 1.25k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.25k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.25k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.25k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.25k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.25k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.25k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.25k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.25k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.25k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.25k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.25k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.25k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.25k | constexpr bool READ_32_BITS = | 203 | 1.25k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.25k | if (READ_32_BITS) { | 206 | 1.25k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 1.25k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 1.25k | return word & mask; | 209 | 1.25k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 1.25k | } |
_ZN5doris11UnpackValueILi8ELi12ELb0EEEmPKh Line | Count | Source | 176 | 1.25k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.25k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.25k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.25k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.25k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.25k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.25k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.25k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.25k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.25k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.25k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.25k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.25k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.25k | constexpr bool READ_32_BITS = | 203 | 1.25k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.25k | if (READ_32_BITS) { | 206 | 1.25k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 1.25k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 1.25k | return word & mask; | 209 | 1.25k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 1.25k | } |
_ZN5doris11UnpackValueILi8ELi11ELb0EEEmPKh Line | Count | Source | 176 | 1.25k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.25k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.25k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.25k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.25k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.25k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.25k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.25k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.25k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.25k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.25k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.25k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.25k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.25k | constexpr bool READ_32_BITS = | 203 | 1.25k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.25k | if (READ_32_BITS) { | 206 | 1.25k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 1.25k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 1.25k | return word & mask; | 209 | 1.25k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 1.25k | } |
_ZN5doris11UnpackValueILi8ELi10ELb0EEEmPKh Line | Count | Source | 176 | 1.25k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.25k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.25k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.25k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.25k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.25k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.25k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.25k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.25k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.25k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.25k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.25k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.25k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.25k | constexpr bool READ_32_BITS = | 203 | 1.25k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.25k | if (READ_32_BITS) { | 206 | 1.25k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 1.25k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 1.25k | return word & mask; | 209 | 1.25k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 1.25k | } |
_ZN5doris11UnpackValueILi8ELi9ELb0EEEmPKh Line | Count | Source | 176 | 1.25k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.25k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.25k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.25k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.25k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.25k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.25k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.25k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.25k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.25k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.25k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.25k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.25k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.25k | constexpr bool READ_32_BITS = | 203 | 1.25k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.25k | if (READ_32_BITS) { | 206 | 1.25k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 1.25k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 1.25k | return word & mask; | 209 | 1.25k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 1.25k | } |
_ZN5doris11UnpackValueILi8ELi8ELb0EEEmPKh Line | Count | Source | 176 | 1.25k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.25k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.25k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.25k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.25k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.25k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.25k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.25k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.25k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.25k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.25k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.25k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.25k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.25k | constexpr bool READ_32_BITS = | 203 | 1.25k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.25k | if (READ_32_BITS) { | 206 | 1.25k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 1.25k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 1.25k | return word & mask; | 209 | 1.25k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 1.25k | } |
_ZN5doris11UnpackValueILi8ELi7ELb0EEEmPKh Line | Count | Source | 176 | 1.25k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.25k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.25k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.25k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.25k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.25k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.25k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.25k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.25k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.25k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.25k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.25k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.25k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.25k | constexpr bool READ_32_BITS = | 203 | 1.25k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.25k | if (READ_32_BITS) { | 206 | 1.25k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 1.25k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 1.25k | return word & mask; | 209 | 1.25k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 1.25k | } |
_ZN5doris11UnpackValueILi8ELi6ELb0EEEmPKh Line | Count | Source | 176 | 1.25k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.25k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.25k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.25k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.25k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.25k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.25k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.25k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.25k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.25k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.25k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.25k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.25k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.25k | constexpr bool READ_32_BITS = | 203 | 1.25k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.25k | if (READ_32_BITS) { | 206 | 1.25k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 1.25k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 1.25k | return word & mask; | 209 | 1.25k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 1.25k | } |
_ZN5doris11UnpackValueILi8ELi5ELb0EEEmPKh Line | Count | Source | 176 | 1.25k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.25k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.25k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.25k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.25k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.25k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.25k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.25k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.25k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.25k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.25k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.25k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.25k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.25k | constexpr bool READ_32_BITS = | 203 | 1.25k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.25k | if (READ_32_BITS) { | 206 | 1.25k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 1.25k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 1.25k | return word & mask; | 209 | 1.25k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 1.25k | } |
_ZN5doris11UnpackValueILi8ELi4ELb0EEEmPKh Line | Count | Source | 176 | 1.25k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.25k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.25k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.25k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.25k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.25k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.25k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.25k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.25k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.25k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.25k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.25k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.25k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.25k | constexpr bool READ_32_BITS = | 203 | 1.25k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.25k | if (READ_32_BITS) { | 206 | 1.25k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 1.25k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 1.25k | return word & mask; | 209 | 1.25k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 1.25k | } |
_ZN5doris11UnpackValueILi8ELi3ELb0EEEmPKh Line | Count | Source | 176 | 1.25k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.25k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.25k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.25k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.25k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.25k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.25k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.25k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.25k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.25k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.25k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.25k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.25k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.25k | constexpr bool READ_32_BITS = | 203 | 1.25k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.25k | if (READ_32_BITS) { | 206 | 1.25k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 1.25k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 1.25k | return word & mask; | 209 | 1.25k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 1.25k | } |
_ZN5doris11UnpackValueILi8ELi2ELb0EEEmPKh Line | Count | Source | 176 | 1.25k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.25k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.25k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.25k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.25k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.25k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.25k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.25k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.25k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.25k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.25k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.25k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.25k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.25k | constexpr bool READ_32_BITS = | 203 | 1.25k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.25k | if (READ_32_BITS) { | 206 | 1.25k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 1.25k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 1.25k | return word & mask; | 209 | 1.25k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 1.25k | } |
_ZN5doris11UnpackValueILi8ELi1ELb0EEEmPKh Line | Count | Source | 176 | 1.25k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.25k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.25k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.25k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.25k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.25k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.25k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.25k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.25k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.25k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.25k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.25k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.25k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.25k | constexpr bool READ_32_BITS = | 203 | 1.25k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.25k | if (READ_32_BITS) { | 206 | 1.25k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 1.25k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 1.25k | return word & mask; | 209 | 1.25k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 1.25k | } |
_ZN5doris11UnpackValueILi8ELi0ELb0EEEmPKh Line | Count | Source | 176 | 1.25k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.25k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.25k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.25k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.25k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.25k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.25k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.25k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.25k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.25k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.25k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.25k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.25k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.25k | constexpr bool READ_32_BITS = | 203 | 1.25k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.25k | if (READ_32_BITS) { | 206 | 1.25k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 1.25k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 1.25k | return word & mask; | 209 | 1.25k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 1.25k | } |
_ZN5doris11UnpackValueILi9ELi0ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi1ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi2ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi3ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi4ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi5ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi6ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi7ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi8ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi9ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi10ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi11ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi12ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi13ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi14ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi15ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi16ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi17ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi18ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi19ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi20ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi21ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi22ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi23ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi24ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi25ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi26ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi27ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi28ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.83k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.83k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.83k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.83k | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi29ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 4.83k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.83k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.83k | return word & mask; | 209 | 4.83k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi30ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 4.83k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.83k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.83k | return word & mask; | 209 | 4.83k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.83k | } |
_ZN5doris11UnpackValueILi9ELi31ELb1EEEmPKh Line | Count | Source | 176 | 4.83k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.83k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.83k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.83k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.83k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.83k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.83k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.83k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.83k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.83k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.83k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.83k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.83k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.83k | constexpr bool READ_32_BITS = | 203 | 4.83k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.83k | if (READ_32_BITS) { | 206 | 4.83k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.83k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.83k | return word & mask; | 209 | 4.83k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.83k | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi9ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi9ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi9ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi9ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi9ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi9ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi9ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi9ELi23ELb0EEEmPKh Line | Count | Source | 176 | 314 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 314 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 314 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 314 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 314 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 314 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 314 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 314 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 314 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 314 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 314 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 314 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 314 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 314 | constexpr bool READ_32_BITS = | 203 | 314 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 314 | if (READ_32_BITS) { | 206 | 314 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 314 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 314 | return word & mask; | 209 | 314 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 314 | } |
_ZN5doris11UnpackValueILi9ELi22ELb0EEEmPKh Line | Count | Source | 176 | 314 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 314 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 314 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 314 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 314 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 314 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 314 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 314 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 314 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 314 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 314 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 314 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 314 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 314 | constexpr bool READ_32_BITS = | 203 | 314 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 314 | if (READ_32_BITS) { | 206 | 314 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 314 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 314 | return word & mask; | 209 | 314 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 314 | } |
_ZN5doris11UnpackValueILi9ELi21ELb0EEEmPKh Line | Count | Source | 176 | 314 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 314 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 314 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 314 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 314 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 314 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 314 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 314 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 314 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 314 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 314 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 314 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 314 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 314 | constexpr bool READ_32_BITS = | 203 | 314 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 314 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 314 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 314 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 314 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 314 | return word & mask; | 221 | 314 | } |
_ZN5doris11UnpackValueILi9ELi20ELb0EEEmPKh Line | Count | Source | 176 | 314 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 314 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 314 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 314 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 314 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 314 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 314 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 314 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 314 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 314 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 314 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 314 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 314 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 314 | constexpr bool READ_32_BITS = | 203 | 314 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 314 | if (READ_32_BITS) { | 206 | 314 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 314 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 314 | return word & mask; | 209 | 314 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 314 | } |
_ZN5doris11UnpackValueILi9ELi19ELb0EEEmPKh Line | Count | Source | 176 | 314 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 314 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 314 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 314 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 314 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 314 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 314 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 314 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 314 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 314 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 314 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 314 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 314 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 314 | constexpr bool READ_32_BITS = | 203 | 314 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 314 | if (READ_32_BITS) { | 206 | 314 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 314 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 314 | return word & mask; | 209 | 314 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 314 | } |
_ZN5doris11UnpackValueILi9ELi18ELb0EEEmPKh Line | Count | Source | 176 | 314 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 314 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 314 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 314 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 314 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 314 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 314 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 314 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 314 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 314 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 314 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 314 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 314 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 314 | constexpr bool READ_32_BITS = | 203 | 314 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 314 | if (READ_32_BITS) { | 206 | 314 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 314 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 314 | return word & mask; | 209 | 314 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 314 | } |
_ZN5doris11UnpackValueILi9ELi17ELb0EEEmPKh Line | Count | Source | 176 | 314 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 314 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 314 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 314 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 314 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 314 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 314 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 314 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 314 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 314 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 314 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 314 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 314 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 314 | constexpr bool READ_32_BITS = | 203 | 314 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 314 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 314 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 314 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 314 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 314 | return word & mask; | 221 | 314 | } |
_ZN5doris11UnpackValueILi9ELi16ELb0EEEmPKh Line | Count | Source | 176 | 314 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 314 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 314 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 314 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 314 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 314 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 314 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 314 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 314 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 314 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 314 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 314 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 314 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 314 | constexpr bool READ_32_BITS = | 203 | 314 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 314 | if (READ_32_BITS) { | 206 | 314 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 314 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 314 | return word & mask; | 209 | 314 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 314 | } |
_ZN5doris11UnpackValueILi9ELi15ELb0EEEmPKh Line | Count | Source | 176 | 314 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 314 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 314 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 314 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 314 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 314 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 314 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 314 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 314 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 314 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 314 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 314 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 314 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 314 | constexpr bool READ_32_BITS = | 203 | 314 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 314 | if (READ_32_BITS) { | 206 | 314 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 314 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 314 | return word & mask; | 209 | 314 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 314 | } |
_ZN5doris11UnpackValueILi9ELi14ELb0EEEmPKh Line | Count | Source | 176 | 314 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 314 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 314 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 314 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 314 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 314 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 314 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 314 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 314 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 314 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 314 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 314 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 314 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 314 | constexpr bool READ_32_BITS = | 203 | 314 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 314 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 314 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 314 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 314 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 314 | return word & mask; | 221 | 314 | } |
_ZN5doris11UnpackValueILi9ELi13ELb0EEEmPKh Line | Count | Source | 176 | 314 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 314 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 314 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 314 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 314 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 314 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 314 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 314 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 314 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 314 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 314 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 314 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 314 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 314 | constexpr bool READ_32_BITS = | 203 | 314 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 314 | if (READ_32_BITS) { | 206 | 314 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 314 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 314 | return word & mask; | 209 | 314 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 314 | } |
_ZN5doris11UnpackValueILi9ELi12ELb0EEEmPKh Line | Count | Source | 176 | 314 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 314 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 314 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 314 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 314 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 314 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 314 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 314 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 314 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 314 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 314 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 314 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 314 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 314 | constexpr bool READ_32_BITS = | 203 | 314 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 314 | if (READ_32_BITS) { | 206 | 314 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 314 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 314 | return word & mask; | 209 | 314 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 314 | } |
_ZN5doris11UnpackValueILi9ELi11ELb0EEEmPKh Line | Count | Source | 176 | 314 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 314 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 314 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 314 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 314 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 314 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 314 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 314 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 314 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 314 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 314 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 314 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 314 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 314 | constexpr bool READ_32_BITS = | 203 | 314 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 314 | if (READ_32_BITS) { | 206 | 314 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 314 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 314 | return word & mask; | 209 | 314 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 314 | } |
_ZN5doris11UnpackValueILi9ELi10ELb0EEEmPKh Line | Count | Source | 176 | 314 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 314 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 314 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 314 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 314 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 314 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 314 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 314 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 314 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 314 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 314 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 314 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 314 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 314 | constexpr bool READ_32_BITS = | 203 | 314 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 314 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 314 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 314 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 314 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 314 | return word & mask; | 221 | 314 | } |
_ZN5doris11UnpackValueILi9ELi9ELb0EEEmPKh Line | Count | Source | 176 | 314 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 314 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 314 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 314 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 314 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 314 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 314 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 314 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 314 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 314 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 314 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 314 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 314 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 314 | constexpr bool READ_32_BITS = | 203 | 314 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 314 | if (READ_32_BITS) { | 206 | 314 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 314 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 314 | return word & mask; | 209 | 314 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 314 | } |
_ZN5doris11UnpackValueILi9ELi8ELb0EEEmPKh Line | Count | Source | 176 | 314 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 314 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 314 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 314 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 314 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 314 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 314 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 314 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 314 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 314 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 314 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 314 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 314 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 314 | constexpr bool READ_32_BITS = | 203 | 314 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 314 | if (READ_32_BITS) { | 206 | 314 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 314 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 314 | return word & mask; | 209 | 314 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 314 | } |
_ZN5doris11UnpackValueILi9ELi7ELb0EEEmPKh Line | Count | Source | 176 | 314 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 314 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 314 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 314 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 314 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 314 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 314 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 314 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 314 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 314 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 314 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 314 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 314 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 314 | constexpr bool READ_32_BITS = | 203 | 314 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 314 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 314 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 314 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 314 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 314 | return word & mask; | 221 | 314 | } |
_ZN5doris11UnpackValueILi9ELi6ELb0EEEmPKh Line | Count | Source | 176 | 314 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 314 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 314 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 314 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 314 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 314 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 314 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 314 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 314 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 314 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 314 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 314 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 314 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 314 | constexpr bool READ_32_BITS = | 203 | 314 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 314 | if (READ_32_BITS) { | 206 | 314 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 314 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 314 | return word & mask; | 209 | 314 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 314 | } |
_ZN5doris11UnpackValueILi9ELi5ELb0EEEmPKh Line | Count | Source | 176 | 314 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 314 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 314 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 314 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 314 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 314 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 314 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 314 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 314 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 314 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 314 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 314 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 314 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 314 | constexpr bool READ_32_BITS = | 203 | 314 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 314 | if (READ_32_BITS) { | 206 | 314 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 314 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 314 | return word & mask; | 209 | 314 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 314 | } |
_ZN5doris11UnpackValueILi9ELi4ELb0EEEmPKh Line | Count | Source | 176 | 314 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 314 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 314 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 314 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 314 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 314 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 314 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 314 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 314 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 314 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 314 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 314 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 314 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 314 | constexpr bool READ_32_BITS = | 203 | 314 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 314 | if (READ_32_BITS) { | 206 | 314 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 314 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 314 | return word & mask; | 209 | 314 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 314 | } |
_ZN5doris11UnpackValueILi9ELi3ELb0EEEmPKh Line | Count | Source | 176 | 314 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 314 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 314 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 314 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 314 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 314 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 314 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 314 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 314 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 314 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 314 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 314 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 314 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 314 | constexpr bool READ_32_BITS = | 203 | 314 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 314 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 314 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 314 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 314 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 314 | return word & mask; | 221 | 314 | } |
_ZN5doris11UnpackValueILi9ELi2ELb0EEEmPKh Line | Count | Source | 176 | 314 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 314 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 314 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 314 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 314 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 314 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 314 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 314 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 314 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 314 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 314 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 314 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 314 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 314 | constexpr bool READ_32_BITS = | 203 | 314 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 314 | if (READ_32_BITS) { | 206 | 314 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 314 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 314 | return word & mask; | 209 | 314 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 314 | } |
_ZN5doris11UnpackValueILi9ELi1ELb0EEEmPKh Line | Count | Source | 176 | 314 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 314 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 314 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 314 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 314 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 314 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 314 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 314 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 314 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 314 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 314 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 314 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 314 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 314 | constexpr bool READ_32_BITS = | 203 | 314 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 314 | if (READ_32_BITS) { | 206 | 314 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 314 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 314 | return word & mask; | 209 | 314 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 314 | } |
_ZN5doris11UnpackValueILi9ELi0ELb0EEEmPKh Line | Count | Source | 176 | 314 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 314 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 314 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 314 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 314 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 314 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 314 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 314 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 314 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 314 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 314 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 314 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 314 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 314 | constexpr bool READ_32_BITS = | 203 | 314 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 314 | if (READ_32_BITS) { | 206 | 314 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 314 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 314 | return word & mask; | 209 | 314 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 314 | } |
_ZN5doris11UnpackValueILi10ELi0ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi1ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi2ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi3ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi4ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi5ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi6ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi7ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi8ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi9ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi10ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi11ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi12ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi13ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi14ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi15ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi16ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi17ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi18ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi19ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi20ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi21ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi22ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi23ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi24ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi25ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi26ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi27ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi28ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 13.2k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 13.2k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 13.2k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 13.2k | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi29ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 13.2k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 13.2k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 13.2k | return word & mask; | 209 | 13.2k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi30ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 13.2k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 13.2k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 13.2k | return word & mask; | 209 | 13.2k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 13.2k | } |
_ZN5doris11UnpackValueILi10ELi31ELb1EEEmPKh Line | Count | Source | 176 | 13.2k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 13.2k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 13.2k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 13.2k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 13.2k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 13.2k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 13.2k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 13.2k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 13.2k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 13.2k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 13.2k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 13.2k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 13.2k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 13.2k | constexpr bool READ_32_BITS = | 203 | 13.2k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 13.2k | if (READ_32_BITS) { | 206 | 13.2k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 13.2k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 13.2k | return word & mask; | 209 | 13.2k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 13.2k | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi10ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi10ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi10ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi10ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi10ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi10ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi10ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi10ELi23ELb0EEEmPKh Line | Count | Source | 176 | 875 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 875 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 875 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 875 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 875 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 875 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 875 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 875 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 875 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 875 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 875 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 875 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 875 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 875 | constexpr bool READ_32_BITS = | 203 | 875 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 875 | if (READ_32_BITS) { | 206 | 875 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 875 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 875 | return word & mask; | 209 | 875 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 875 | } |
_ZN5doris11UnpackValueILi10ELi22ELb0EEEmPKh Line | Count | Source | 176 | 875 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 875 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 875 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 875 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 875 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 875 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 875 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 875 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 875 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 875 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 875 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 875 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 875 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 875 | constexpr bool READ_32_BITS = | 203 | 875 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 875 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 875 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 875 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 875 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 875 | return word & mask; | 221 | 875 | } |
_ZN5doris11UnpackValueILi10ELi21ELb0EEEmPKh Line | Count | Source | 176 | 875 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 875 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 875 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 875 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 875 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 875 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 875 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 875 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 875 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 875 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 875 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 875 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 875 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 875 | constexpr bool READ_32_BITS = | 203 | 875 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 875 | if (READ_32_BITS) { | 206 | 875 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 875 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 875 | return word & mask; | 209 | 875 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 875 | } |
_ZN5doris11UnpackValueILi10ELi20ELb0EEEmPKh Line | Count | Source | 176 | 875 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 875 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 875 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 875 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 875 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 875 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 875 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 875 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 875 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 875 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 875 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 875 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 875 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 875 | constexpr bool READ_32_BITS = | 203 | 875 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 875 | if (READ_32_BITS) { | 206 | 875 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 875 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 875 | return word & mask; | 209 | 875 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 875 | } |
_ZN5doris11UnpackValueILi10ELi19ELb0EEEmPKh Line | Count | Source | 176 | 875 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 875 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 875 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 875 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 875 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 875 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 875 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 875 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 875 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 875 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 875 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 875 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 875 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 875 | constexpr bool READ_32_BITS = | 203 | 875 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 875 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 875 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 875 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 875 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 875 | return word & mask; | 221 | 875 | } |
_ZN5doris11UnpackValueILi10ELi18ELb0EEEmPKh Line | Count | Source | 176 | 875 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 875 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 875 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 875 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 875 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 875 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 875 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 875 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 875 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 875 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 875 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 875 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 875 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 875 | constexpr bool READ_32_BITS = | 203 | 875 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 875 | if (READ_32_BITS) { | 206 | 875 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 875 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 875 | return word & mask; | 209 | 875 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 875 | } |
_ZN5doris11UnpackValueILi10ELi17ELb0EEEmPKh Line | Count | Source | 176 | 875 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 875 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 875 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 875 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 875 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 875 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 875 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 875 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 875 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 875 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 875 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 875 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 875 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 875 | constexpr bool READ_32_BITS = | 203 | 875 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 875 | if (READ_32_BITS) { | 206 | 875 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 875 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 875 | return word & mask; | 209 | 875 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 875 | } |
_ZN5doris11UnpackValueILi10ELi16ELb0EEEmPKh Line | Count | Source | 176 | 875 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 875 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 875 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 875 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 875 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 875 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 875 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 875 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 875 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 875 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 875 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 875 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 875 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 875 | constexpr bool READ_32_BITS = | 203 | 875 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 875 | if (READ_32_BITS) { | 206 | 875 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 875 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 875 | return word & mask; | 209 | 875 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 875 | } |
_ZN5doris11UnpackValueILi10ELi15ELb0EEEmPKh Line | Count | Source | 176 | 882 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 882 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 882 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 882 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 882 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 882 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 882 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 882 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 882 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 882 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 882 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 882 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 882 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 882 | constexpr bool READ_32_BITS = | 203 | 882 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 882 | if (READ_32_BITS) { | 206 | 882 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 882 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 882 | return word & mask; | 209 | 882 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 882 | } |
_ZN5doris11UnpackValueILi10ELi14ELb0EEEmPKh Line | Count | Source | 176 | 882 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 882 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 882 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 882 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 882 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 882 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 882 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 882 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 882 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 882 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 882 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 882 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 882 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 882 | constexpr bool READ_32_BITS = | 203 | 882 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 882 | if (READ_32_BITS) { | 206 | 882 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 882 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 882 | return word & mask; | 209 | 882 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 882 | } |
_ZN5doris11UnpackValueILi10ELi13ELb0EEEmPKh Line | Count | Source | 176 | 882 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 882 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 882 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 882 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 882 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 882 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 882 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 882 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 882 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 882 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 882 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 882 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 882 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 882 | constexpr bool READ_32_BITS = | 203 | 882 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 882 | if (READ_32_BITS) { | 206 | 882 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 882 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 882 | return word & mask; | 209 | 882 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 882 | } |
_ZN5doris11UnpackValueILi10ELi12ELb0EEEmPKh Line | Count | Source | 176 | 882 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 882 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 882 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 882 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 882 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 882 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 882 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 882 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 882 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 882 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 882 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 882 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 882 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 882 | constexpr bool READ_32_BITS = | 203 | 882 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 882 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 882 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 882 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 882 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 882 | return word & mask; | 221 | 882 | } |
_ZN5doris11UnpackValueILi10ELi11ELb0EEEmPKh Line | Count | Source | 176 | 882 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 882 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 882 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 882 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 882 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 882 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 882 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 882 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 882 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 882 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 882 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 882 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 882 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 882 | constexpr bool READ_32_BITS = | 203 | 882 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 882 | if (READ_32_BITS) { | 206 | 882 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 882 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 882 | return word & mask; | 209 | 882 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 882 | } |
_ZN5doris11UnpackValueILi10ELi10ELb0EEEmPKh Line | Count | Source | 176 | 882 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 882 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 882 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 882 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 882 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 882 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 882 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 882 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 882 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 882 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 882 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 882 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 882 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 882 | constexpr bool READ_32_BITS = | 203 | 882 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 882 | if (READ_32_BITS) { | 206 | 882 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 882 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 882 | return word & mask; | 209 | 882 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 882 | } |
_ZN5doris11UnpackValueILi10ELi9ELb0EEEmPKh Line | Count | Source | 176 | 882 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 882 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 882 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 882 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 882 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 882 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 882 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 882 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 882 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 882 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 882 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 882 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 882 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 882 | constexpr bool READ_32_BITS = | 203 | 882 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 882 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 882 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 882 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 882 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 882 | return word & mask; | 221 | 882 | } |
_ZN5doris11UnpackValueILi10ELi8ELb0EEEmPKh Line | Count | Source | 176 | 882 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 882 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 882 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 882 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 882 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 882 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 882 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 882 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 882 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 882 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 882 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 882 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 882 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 882 | constexpr bool READ_32_BITS = | 203 | 882 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 882 | if (READ_32_BITS) { | 206 | 882 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 882 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 882 | return word & mask; | 209 | 882 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 882 | } |
_ZN5doris11UnpackValueILi10ELi7ELb0EEEmPKh Line | Count | Source | 176 | 884 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 884 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 884 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 884 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 884 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 884 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 884 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 884 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 884 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 884 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 884 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 884 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 884 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 884 | constexpr bool READ_32_BITS = | 203 | 884 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 884 | if (READ_32_BITS) { | 206 | 884 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 884 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 884 | return word & mask; | 209 | 884 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 884 | } |
_ZN5doris11UnpackValueILi10ELi6ELb0EEEmPKh Line | Count | Source | 176 | 884 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 884 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 884 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 884 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 884 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 884 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 884 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 884 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 884 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 884 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 884 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 884 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 884 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 884 | constexpr bool READ_32_BITS = | 203 | 884 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 884 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 884 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 884 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 884 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 884 | return word & mask; | 221 | 884 | } |
_ZN5doris11UnpackValueILi10ELi5ELb0EEEmPKh Line | Count | Source | 176 | 884 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 884 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 884 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 884 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 884 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 884 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 884 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 884 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 884 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 884 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 884 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 884 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 884 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 884 | constexpr bool READ_32_BITS = | 203 | 884 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 884 | if (READ_32_BITS) { | 206 | 884 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 884 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 884 | return word & mask; | 209 | 884 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 884 | } |
_ZN5doris11UnpackValueILi10ELi4ELb0EEEmPKh Line | Count | Source | 176 | 884 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 884 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 884 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 884 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 884 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 884 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 884 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 884 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 884 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 884 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 884 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 884 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 884 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 884 | constexpr bool READ_32_BITS = | 203 | 884 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 884 | if (READ_32_BITS) { | 206 | 884 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 884 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 884 | return word & mask; | 209 | 884 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 884 | } |
_ZN5doris11UnpackValueILi10ELi3ELb0EEEmPKh Line | Count | Source | 176 | 884 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 884 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 884 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 884 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 884 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 884 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 884 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 884 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 884 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 884 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 884 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 884 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 884 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 884 | constexpr bool READ_32_BITS = | 203 | 884 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 884 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 884 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 884 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 884 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 884 | return word & mask; | 221 | 884 | } |
_ZN5doris11UnpackValueILi10ELi2ELb0EEEmPKh Line | Count | Source | 176 | 884 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 884 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 884 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 884 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 884 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 884 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 884 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 884 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 884 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 884 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 884 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 884 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 884 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 884 | constexpr bool READ_32_BITS = | 203 | 884 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 884 | if (READ_32_BITS) { | 206 | 884 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 884 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 884 | return word & mask; | 209 | 884 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 884 | } |
_ZN5doris11UnpackValueILi10ELi1ELb0EEEmPKh Line | Count | Source | 176 | 884 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 884 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 884 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 884 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 884 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 884 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 884 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 884 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 884 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 884 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 884 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 884 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 884 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 884 | constexpr bool READ_32_BITS = | 203 | 884 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 884 | if (READ_32_BITS) { | 206 | 884 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 884 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 884 | return word & mask; | 209 | 884 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 884 | } |
_ZN5doris11UnpackValueILi10ELi0ELb0EEEmPKh Line | Count | Source | 176 | 884 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 884 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 884 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 884 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 884 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 884 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 884 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 884 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 884 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 884 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 884 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 884 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 884 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 884 | constexpr bool READ_32_BITS = | 203 | 884 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 884 | if (READ_32_BITS) { | 206 | 884 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 884 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 884 | return word & mask; | 209 | 884 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 884 | } |
_ZN5doris11UnpackValueILi11ELi0ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi1ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi2ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi3ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi4ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi5ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi6ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi7ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi8ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi9ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi10ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi11ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi12ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi13ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi14ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi15ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi16ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi17ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi18ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi19ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi20ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi21ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi22ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi23ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi24ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi25ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi26ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi27ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi28ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi29ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 38.1k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 38.1k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 38.1k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 38.1k | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi30ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 38.1k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 38.1k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 38.1k | return word & mask; | 209 | 38.1k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 38.1k | } |
_ZN5doris11UnpackValueILi11ELi31ELb1EEEmPKh Line | Count | Source | 176 | 38.1k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 38.1k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 38.1k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 38.1k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 38.1k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 38.1k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 38.1k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 38.1k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 38.1k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 38.1k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 38.1k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 38.1k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 38.1k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 38.1k | constexpr bool READ_32_BITS = | 203 | 38.1k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 38.1k | if (READ_32_BITS) { | 206 | 38.1k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 38.1k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 38.1k | return word & mask; | 209 | 38.1k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 38.1k | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi11ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi11ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi11ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi11ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi11ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi11ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi11ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi11ELi23ELb0EEEmPKh Line | Count | Source | 176 | 2.50k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 2.50k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 2.50k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 2.50k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 2.50k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 2.50k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 2.50k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 2.50k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 2.50k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 2.50k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 2.50k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 2.50k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 2.50k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 2.50k | constexpr bool READ_32_BITS = | 203 | 2.50k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 2.50k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 2.50k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 2.50k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 2.50k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 2.50k | return word & mask; | 221 | 2.50k | } |
_ZN5doris11UnpackValueILi11ELi22ELb0EEEmPKh Line | Count | Source | 176 | 2.50k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 2.50k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 2.50k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 2.50k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 2.50k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 2.50k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 2.50k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 2.50k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 2.50k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 2.50k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 2.50k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 2.50k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 2.50k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 2.50k | constexpr bool READ_32_BITS = | 203 | 2.50k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 2.50k | if (READ_32_BITS) { | 206 | 2.50k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 2.50k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 2.50k | return word & mask; | 209 | 2.50k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 2.50k | } |
_ZN5doris11UnpackValueILi11ELi21ELb0EEEmPKh Line | Count | Source | 176 | 2.50k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 2.50k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 2.50k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 2.50k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 2.50k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 2.50k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 2.50k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 2.50k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 2.50k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 2.50k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 2.50k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 2.50k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 2.50k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 2.50k | constexpr bool READ_32_BITS = | 203 | 2.50k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 2.50k | if (READ_32_BITS) { | 206 | 2.50k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 2.50k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 2.50k | return word & mask; | 209 | 2.50k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 2.50k | } |
_ZN5doris11UnpackValueILi11ELi20ELb0EEEmPKh Line | Count | Source | 176 | 2.50k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 2.50k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 2.50k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 2.50k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 2.50k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 2.50k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 2.50k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 2.50k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 2.50k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 2.50k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 2.50k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 2.50k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 2.50k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 2.50k | constexpr bool READ_32_BITS = | 203 | 2.50k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 2.50k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 2.50k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 2.50k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 2.50k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 2.50k | return word & mask; | 221 | 2.50k | } |
_ZN5doris11UnpackValueILi11ELi19ELb0EEEmPKh Line | Count | Source | 176 | 2.50k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 2.50k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 2.50k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 2.50k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 2.50k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 2.50k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 2.50k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 2.50k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 2.50k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 2.50k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 2.50k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 2.50k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 2.50k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 2.50k | constexpr bool READ_32_BITS = | 203 | 2.50k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 2.50k | if (READ_32_BITS) { | 206 | 2.50k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 2.50k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 2.50k | return word & mask; | 209 | 2.50k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 2.50k | } |
_ZN5doris11UnpackValueILi11ELi18ELb0EEEmPKh Line | Count | Source | 176 | 2.50k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 2.50k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 2.50k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 2.50k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 2.50k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 2.50k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 2.50k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 2.50k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 2.50k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 2.50k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 2.50k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 2.50k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 2.50k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 2.50k | constexpr bool READ_32_BITS = | 203 | 2.50k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 2.50k | if (READ_32_BITS) { | 206 | 2.50k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 2.50k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 2.50k | return word & mask; | 209 | 2.50k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 2.50k | } |
_ZN5doris11UnpackValueILi11ELi17ELb0EEEmPKh Line | Count | Source | 176 | 2.50k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 2.50k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 2.50k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 2.50k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 2.50k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 2.50k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 2.50k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 2.50k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 2.50k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 2.50k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 2.50k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 2.50k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 2.50k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 2.50k | constexpr bool READ_32_BITS = | 203 | 2.50k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 2.50k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 2.50k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 2.50k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 2.50k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 2.50k | return word & mask; | 221 | 2.50k | } |
_ZN5doris11UnpackValueILi11ELi16ELb0EEEmPKh Line | Count | Source | 176 | 2.50k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 2.50k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 2.50k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 2.50k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 2.50k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 2.50k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 2.50k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 2.50k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 2.50k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 2.50k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 2.50k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 2.50k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 2.50k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 2.50k | constexpr bool READ_32_BITS = | 203 | 2.50k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 2.50k | if (READ_32_BITS) { | 206 | 2.50k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 2.50k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 2.50k | return word & mask; | 209 | 2.50k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 2.50k | } |
_ZN5doris11UnpackValueILi11ELi15ELb0EEEmPKh Line | Count | Source | 176 | 2.51k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 2.51k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 2.51k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 2.51k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 2.51k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 2.51k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 2.51k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 2.51k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 2.51k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 2.51k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 2.51k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 2.51k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 2.51k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 2.51k | constexpr bool READ_32_BITS = | 203 | 2.51k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 2.51k | if (READ_32_BITS) { | 206 | 2.51k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 2.51k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 2.51k | return word & mask; | 209 | 2.51k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 2.51k | } |
_ZN5doris11UnpackValueILi11ELi14ELb0EEEmPKh Line | Count | Source | 176 | 2.51k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 2.51k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 2.51k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 2.51k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 2.51k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 2.51k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 2.51k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 2.51k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 2.51k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 2.51k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 2.51k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 2.51k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 2.51k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 2.51k | constexpr bool READ_32_BITS = | 203 | 2.51k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 2.51k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 2.51k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 2.51k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 2.51k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 2.51k | return word & mask; | 221 | 2.51k | } |
_ZN5doris11UnpackValueILi11ELi13ELb0EEEmPKh Line | Count | Source | 176 | 2.51k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 2.51k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 2.51k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 2.51k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 2.51k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 2.51k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 2.51k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 2.51k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 2.51k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 2.51k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 2.51k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 2.51k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 2.51k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 2.51k | constexpr bool READ_32_BITS = | 203 | 2.51k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 2.51k | if (READ_32_BITS) { | 206 | 2.51k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 2.51k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 2.51k | return word & mask; | 209 | 2.51k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 2.51k | } |
_ZN5doris11UnpackValueILi11ELi12ELb0EEEmPKh Line | Count | Source | 176 | 2.51k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 2.51k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 2.51k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 2.51k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 2.51k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 2.51k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 2.51k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 2.51k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 2.51k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 2.51k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 2.51k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 2.51k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 2.51k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 2.51k | constexpr bool READ_32_BITS = | 203 | 2.51k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 2.51k | if (READ_32_BITS) { | 206 | 2.51k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 2.51k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 2.51k | return word & mask; | 209 | 2.51k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 2.51k | } |
_ZN5doris11UnpackValueILi11ELi11ELb0EEEmPKh Line | Count | Source | 176 | 2.51k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 2.51k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 2.51k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 2.51k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 2.51k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 2.51k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 2.51k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 2.51k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 2.51k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 2.51k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 2.51k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 2.51k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 2.51k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 2.51k | constexpr bool READ_32_BITS = | 203 | 2.51k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 2.51k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 2.51k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 2.51k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 2.51k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 2.51k | return word & mask; | 221 | 2.51k | } |
_ZN5doris11UnpackValueILi11ELi10ELb0EEEmPKh Line | Count | Source | 176 | 2.51k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 2.51k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 2.51k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 2.51k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 2.51k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 2.51k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 2.51k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 2.51k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 2.51k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 2.51k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 2.51k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 2.51k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 2.51k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 2.51k | constexpr bool READ_32_BITS = | 203 | 2.51k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 2.51k | if (READ_32_BITS) { | 206 | 2.51k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 2.51k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 2.51k | return word & mask; | 209 | 2.51k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 2.51k | } |
_ZN5doris11UnpackValueILi11ELi9ELb0EEEmPKh Line | Count | Source | 176 | 2.51k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 2.51k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 2.51k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 2.51k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 2.51k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 2.51k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 2.51k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 2.51k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 2.51k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 2.51k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 2.51k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 2.51k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 2.51k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 2.51k | constexpr bool READ_32_BITS = | 203 | 2.51k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 2.51k | if (READ_32_BITS) { | 206 | 2.51k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 2.51k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 2.51k | return word & mask; | 209 | 2.51k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 2.51k | } |
_ZN5doris11UnpackValueILi11ELi8ELb0EEEmPKh Line | Count | Source | 176 | 2.51k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 2.51k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 2.51k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 2.51k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 2.51k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 2.51k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 2.51k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 2.51k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 2.51k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 2.51k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 2.51k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 2.51k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 2.51k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 2.51k | constexpr bool READ_32_BITS = | 203 | 2.51k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 2.51k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 2.51k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 2.51k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 2.51k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 2.51k | return word & mask; | 221 | 2.51k | } |
_ZN5doris11UnpackValueILi11ELi7ELb0EEEmPKh Line | Count | Source | 176 | 2.51k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 2.51k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 2.51k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 2.51k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 2.51k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 2.51k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 2.51k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 2.51k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 2.51k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 2.51k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 2.51k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 2.51k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 2.51k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 2.51k | constexpr bool READ_32_BITS = | 203 | 2.51k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 2.51k | if (READ_32_BITS) { | 206 | 2.51k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 2.51k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 2.51k | return word & mask; | 209 | 2.51k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 2.51k | } |
_ZN5doris11UnpackValueILi11ELi6ELb0EEEmPKh Line | Count | Source | 176 | 2.51k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 2.51k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 2.51k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 2.51k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 2.51k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 2.51k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 2.51k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 2.51k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 2.51k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 2.51k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 2.51k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 2.51k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 2.51k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 2.51k | constexpr bool READ_32_BITS = | 203 | 2.51k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 2.51k | if (READ_32_BITS) { | 206 | 2.51k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 2.51k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 2.51k | return word & mask; | 209 | 2.51k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 2.51k | } |
_ZN5doris11UnpackValueILi11ELi5ELb0EEEmPKh Line | Count | Source | 176 | 2.51k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 2.51k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 2.51k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 2.51k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 2.51k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 2.51k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 2.51k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 2.51k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 2.51k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 2.51k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 2.51k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 2.51k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 2.51k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 2.51k | constexpr bool READ_32_BITS = | 203 | 2.51k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 2.51k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 2.51k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 2.51k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 2.51k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 2.51k | return word & mask; | 221 | 2.51k | } |
_ZN5doris11UnpackValueILi11ELi4ELb0EEEmPKh Line | Count | Source | 176 | 2.51k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 2.51k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 2.51k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 2.51k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 2.51k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 2.51k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 2.51k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 2.51k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 2.51k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 2.51k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 2.51k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 2.51k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 2.51k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 2.51k | constexpr bool READ_32_BITS = | 203 | 2.51k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 2.51k | if (READ_32_BITS) { | 206 | 2.51k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 2.51k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 2.51k | return word & mask; | 209 | 2.51k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 2.51k | } |
_ZN5doris11UnpackValueILi11ELi3ELb0EEEmPKh Line | Count | Source | 176 | 2.51k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 2.51k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 2.51k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 2.51k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 2.51k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 2.51k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 2.51k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 2.51k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 2.51k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 2.51k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 2.51k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 2.51k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 2.51k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 2.51k | constexpr bool READ_32_BITS = | 203 | 2.51k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 2.51k | if (READ_32_BITS) { | 206 | 2.51k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 2.51k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 2.51k | return word & mask; | 209 | 2.51k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 2.51k | } |
_ZN5doris11UnpackValueILi11ELi2ELb0EEEmPKh Line | Count | Source | 176 | 2.51k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 2.51k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 2.51k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 2.51k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 2.51k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 2.51k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 2.51k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 2.51k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 2.51k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 2.51k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 2.51k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 2.51k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 2.51k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 2.51k | constexpr bool READ_32_BITS = | 203 | 2.51k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 2.51k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 2.51k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 2.51k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 2.51k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 2.51k | return word & mask; | 221 | 2.51k | } |
_ZN5doris11UnpackValueILi11ELi1ELb0EEEmPKh Line | Count | Source | 176 | 2.51k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 2.51k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 2.51k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 2.51k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 2.51k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 2.51k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 2.51k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 2.51k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 2.51k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 2.51k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 2.51k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 2.51k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 2.51k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 2.51k | constexpr bool READ_32_BITS = | 203 | 2.51k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 2.51k | if (READ_32_BITS) { | 206 | 2.51k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 2.51k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 2.51k | return word & mask; | 209 | 2.51k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 2.51k | } |
_ZN5doris11UnpackValueILi11ELi0ELb0EEEmPKh Line | Count | Source | 176 | 2.51k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 2.51k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 2.51k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 2.51k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 2.51k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 2.51k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 2.51k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 2.51k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 2.51k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 2.51k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 2.51k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 2.51k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 2.51k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 2.51k | constexpr bool READ_32_BITS = | 203 | 2.51k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 2.51k | if (READ_32_BITS) { | 206 | 2.51k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 2.51k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 2.51k | return word & mask; | 209 | 2.51k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 2.51k | } |
_ZN5doris11UnpackValueILi12ELi0ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi1ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi2ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi3ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi4ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi5ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi6ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi7ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi8ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi9ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi10ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi11ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi12ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi13ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi14ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi15ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi16ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi17ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi18ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi19ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi20ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi21ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi22ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi23ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi24ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi25ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi26ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi27ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi28ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi29ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 71.7k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 71.7k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 71.7k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 71.7k | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi30ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 71.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 71.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 71.7k | return word & mask; | 209 | 71.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 71.7k | } |
_ZN5doris11UnpackValueILi12ELi31ELb1EEEmPKh Line | Count | Source | 176 | 71.7k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 71.7k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 71.7k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 71.7k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 71.7k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 71.7k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 71.7k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 71.7k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 71.7k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 71.7k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 71.7k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 71.7k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 71.7k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 71.7k | constexpr bool READ_32_BITS = | 203 | 71.7k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 71.7k | if (READ_32_BITS) { | 206 | 71.7k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 71.7k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 71.7k | return word & mask; | 209 | 71.7k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 71.7k | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi12ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi12ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi12ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi12ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi12ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi12ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi12ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi12ELi23ELb0EEEmPKh Line | Count | Source | 176 | 4.76k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.76k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.76k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.76k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.76k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.76k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.76k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.76k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.76k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.76k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.76k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.76k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.76k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.76k | constexpr bool READ_32_BITS = | 203 | 4.76k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.76k | if (READ_32_BITS) { | 206 | 4.76k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.76k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.76k | return word & mask; | 209 | 4.76k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.76k | } |
_ZN5doris11UnpackValueILi12ELi22ELb0EEEmPKh Line | Count | Source | 176 | 4.76k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.76k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.76k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.76k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.76k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.76k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.76k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.76k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.76k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.76k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.76k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.76k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.76k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.76k | constexpr bool READ_32_BITS = | 203 | 4.76k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.76k | if (READ_32_BITS) { | 206 | 4.76k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.76k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.76k | return word & mask; | 209 | 4.76k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.76k | } |
_ZN5doris11UnpackValueILi12ELi21ELb0EEEmPKh Line | Count | Source | 176 | 4.76k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.76k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.76k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.76k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.76k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.76k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.76k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.76k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.76k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.76k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.76k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.76k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.76k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.76k | constexpr bool READ_32_BITS = | 203 | 4.76k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.76k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.76k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.76k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.76k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.76k | return word & mask; | 221 | 4.76k | } |
_ZN5doris11UnpackValueILi12ELi20ELb0EEEmPKh Line | Count | Source | 176 | 4.76k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.76k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.76k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.76k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.76k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.76k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.76k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.76k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.76k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.76k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.76k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.76k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.76k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.76k | constexpr bool READ_32_BITS = | 203 | 4.76k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.76k | if (READ_32_BITS) { | 206 | 4.76k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.76k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.76k | return word & mask; | 209 | 4.76k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.76k | } |
_ZN5doris11UnpackValueILi12ELi19ELb0EEEmPKh Line | Count | Source | 176 | 4.76k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.76k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.76k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.76k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.76k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.76k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.76k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.76k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.76k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.76k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.76k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.76k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.76k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.76k | constexpr bool READ_32_BITS = | 203 | 4.76k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.76k | if (READ_32_BITS) { | 206 | 4.76k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.76k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.76k | return word & mask; | 209 | 4.76k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.76k | } |
_ZN5doris11UnpackValueILi12ELi18ELb0EEEmPKh Line | Count | Source | 176 | 4.76k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.76k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.76k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.76k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.76k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.76k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.76k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.76k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.76k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.76k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.76k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.76k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.76k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.76k | constexpr bool READ_32_BITS = | 203 | 4.76k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.76k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.76k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.76k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.76k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.76k | return word & mask; | 221 | 4.76k | } |
_ZN5doris11UnpackValueILi12ELi17ELb0EEEmPKh Line | Count | Source | 176 | 4.76k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.76k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.76k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.76k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.76k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.76k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.76k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.76k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.76k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.76k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.76k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.76k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.76k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.76k | constexpr bool READ_32_BITS = | 203 | 4.76k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.76k | if (READ_32_BITS) { | 206 | 4.76k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.76k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.76k | return word & mask; | 209 | 4.76k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.76k | } |
_ZN5doris11UnpackValueILi12ELi16ELb0EEEmPKh Line | Count | Source | 176 | 4.76k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.76k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.76k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.76k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.76k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.76k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.76k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.76k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.76k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.76k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.76k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.76k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.76k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.76k | constexpr bool READ_32_BITS = | 203 | 4.76k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.76k | if (READ_32_BITS) { | 206 | 4.76k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.76k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.76k | return word & mask; | 209 | 4.76k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.76k | } |
_ZN5doris11UnpackValueILi12ELi15ELb0EEEmPKh Line | Count | Source | 176 | 4.77k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.77k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.77k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.77k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.77k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.77k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.77k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.77k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.77k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.77k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.77k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.77k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.77k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.77k | constexpr bool READ_32_BITS = | 203 | 4.77k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.77k | if (READ_32_BITS) { | 206 | 4.77k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.77k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.77k | return word & mask; | 209 | 4.77k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.77k | } |
_ZN5doris11UnpackValueILi12ELi14ELb0EEEmPKh Line | Count | Source | 176 | 4.77k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.77k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.77k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.77k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.77k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.77k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.77k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.77k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.77k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.77k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.77k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.77k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.77k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.77k | constexpr bool READ_32_BITS = | 203 | 4.77k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.77k | if (READ_32_BITS) { | 206 | 4.77k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.77k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.77k | return word & mask; | 209 | 4.77k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.77k | } |
_ZN5doris11UnpackValueILi12ELi13ELb0EEEmPKh Line | Count | Source | 176 | 4.77k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.77k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.77k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.77k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.77k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.77k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.77k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.77k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.77k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.77k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.77k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.77k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.77k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.77k | constexpr bool READ_32_BITS = | 203 | 4.77k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.77k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.77k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.77k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.77k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.77k | return word & mask; | 221 | 4.77k | } |
_ZN5doris11UnpackValueILi12ELi12ELb0EEEmPKh Line | Count | Source | 176 | 4.77k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.77k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.77k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.77k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.77k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.77k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.77k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.77k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.77k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.77k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.77k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.77k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.77k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.77k | constexpr bool READ_32_BITS = | 203 | 4.77k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.77k | if (READ_32_BITS) { | 206 | 4.77k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.77k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.77k | return word & mask; | 209 | 4.77k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.77k | } |
_ZN5doris11UnpackValueILi12ELi11ELb0EEEmPKh Line | Count | Source | 176 | 4.77k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.77k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.77k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.77k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.77k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.77k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.77k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.77k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.77k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.77k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.77k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.77k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.77k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.77k | constexpr bool READ_32_BITS = | 203 | 4.77k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.77k | if (READ_32_BITS) { | 206 | 4.77k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.77k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.77k | return word & mask; | 209 | 4.77k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.77k | } |
_ZN5doris11UnpackValueILi12ELi10ELb0EEEmPKh Line | Count | Source | 176 | 4.77k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.77k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.77k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.77k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.77k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.77k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.77k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.77k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.77k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.77k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.77k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.77k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.77k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.77k | constexpr bool READ_32_BITS = | 203 | 4.77k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.77k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.77k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.77k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.77k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.77k | return word & mask; | 221 | 4.77k | } |
_ZN5doris11UnpackValueILi12ELi9ELb0EEEmPKh Line | Count | Source | 176 | 4.77k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.77k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.77k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.77k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.77k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.77k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.77k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.77k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.77k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.77k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.77k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.77k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.77k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.77k | constexpr bool READ_32_BITS = | 203 | 4.77k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.77k | if (READ_32_BITS) { | 206 | 4.77k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.77k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.77k | return word & mask; | 209 | 4.77k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.77k | } |
_ZN5doris11UnpackValueILi12ELi8ELb0EEEmPKh Line | Count | Source | 176 | 4.77k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.77k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.77k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.77k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.77k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.77k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.77k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.77k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.77k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.77k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.77k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.77k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.77k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.77k | constexpr bool READ_32_BITS = | 203 | 4.77k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.77k | if (READ_32_BITS) { | 206 | 4.77k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.77k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.77k | return word & mask; | 209 | 4.77k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.77k | } |
_ZN5doris11UnpackValueILi12ELi7ELb0EEEmPKh Line | Count | Source | 176 | 4.78k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.78k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.78k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.78k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.78k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.78k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.78k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.78k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.78k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.78k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.78k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.78k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.78k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.78k | constexpr bool READ_32_BITS = | 203 | 4.78k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.78k | if (READ_32_BITS) { | 206 | 4.78k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.78k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.78k | return word & mask; | 209 | 4.78k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.78k | } |
_ZN5doris11UnpackValueILi12ELi6ELb0EEEmPKh Line | Count | Source | 176 | 4.78k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.78k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.78k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.78k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.78k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.78k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.78k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.78k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.78k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.78k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.78k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.78k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.78k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.78k | constexpr bool READ_32_BITS = | 203 | 4.78k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.78k | if (READ_32_BITS) { | 206 | 4.78k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.78k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.78k | return word & mask; | 209 | 4.78k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.78k | } |
_ZN5doris11UnpackValueILi12ELi5ELb0EEEmPKh Line | Count | Source | 176 | 4.78k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.78k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.78k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.78k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.78k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.78k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.78k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.78k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.78k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.78k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.78k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.78k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.78k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.78k | constexpr bool READ_32_BITS = | 203 | 4.78k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.78k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.78k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.78k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.78k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.78k | return word & mask; | 221 | 4.78k | } |
_ZN5doris11UnpackValueILi12ELi4ELb0EEEmPKh Line | Count | Source | 176 | 4.78k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.78k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.78k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.78k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.78k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.78k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.78k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.78k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.78k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.78k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.78k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.78k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.78k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.78k | constexpr bool READ_32_BITS = | 203 | 4.78k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.78k | if (READ_32_BITS) { | 206 | 4.78k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.78k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.78k | return word & mask; | 209 | 4.78k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.78k | } |
_ZN5doris11UnpackValueILi12ELi3ELb0EEEmPKh Line | Count | Source | 176 | 4.78k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.78k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.78k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.78k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.78k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.78k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.78k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.78k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.78k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.78k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.78k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.78k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.78k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.78k | constexpr bool READ_32_BITS = | 203 | 4.78k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.78k | if (READ_32_BITS) { | 206 | 4.78k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.78k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.78k | return word & mask; | 209 | 4.78k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.78k | } |
_ZN5doris11UnpackValueILi12ELi2ELb0EEEmPKh Line | Count | Source | 176 | 4.78k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.78k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.78k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.78k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.78k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.78k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.78k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.78k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.78k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.78k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.78k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.78k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.78k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.78k | constexpr bool READ_32_BITS = | 203 | 4.78k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.78k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 4.78k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 4.78k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 4.78k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 4.78k | return word & mask; | 221 | 4.78k | } |
_ZN5doris11UnpackValueILi12ELi1ELb0EEEmPKh Line | Count | Source | 176 | 4.78k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.78k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.78k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.78k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.78k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.78k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.78k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.78k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.78k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.78k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.78k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.78k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.78k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.78k | constexpr bool READ_32_BITS = | 203 | 4.78k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.78k | if (READ_32_BITS) { | 206 | 4.78k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.78k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.78k | return word & mask; | 209 | 4.78k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.78k | } |
_ZN5doris11UnpackValueILi12ELi0ELb0EEEmPKh Line | Count | Source | 176 | 4.78k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.78k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.78k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.78k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.78k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.78k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.78k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.78k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.78k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.78k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.78k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.78k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.78k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.78k | constexpr bool READ_32_BITS = | 203 | 4.78k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.78k | if (READ_32_BITS) { | 206 | 4.78k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.78k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.78k | return word & mask; | 209 | 4.78k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.78k | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi0ELb0EEEmPKh _ZN5doris11UnpackValueILi14ELi0ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi1ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi2ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi3ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi4ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi5ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi6ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi7ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi8ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi9ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi10ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi11ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi12ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi13ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi14ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi15ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi16ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi17ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi18ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi19ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi20ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi21ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi22ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi23ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi24ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi25ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi26ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi27ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi28ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi29ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 1.48k | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 1.48k | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 1.48k | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 1.48k | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi30ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 1.48k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 1.48k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 1.48k | return word & mask; | 209 | 1.48k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 1.48k | } |
_ZN5doris11UnpackValueILi14ELi31ELb1EEEmPKh Line | Count | Source | 176 | 1.48k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 1.48k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 1.48k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 1.48k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 1.48k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 1.48k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 1.48k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 1.48k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 1.48k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 1.48k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 1.48k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 1.48k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 1.48k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 1.48k | constexpr bool READ_32_BITS = | 203 | 1.48k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 1.48k | if (READ_32_BITS) { | 206 | 1.48k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 1.48k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 1.48k | return word & mask; | 209 | 1.48k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 1.48k | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi14ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi14ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi14ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi14ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi14ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi14ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi14ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi14ELi23ELb0EEEmPKh Line | Count | Source | 176 | 97 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 97 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 97 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 97 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 97 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 97 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 97 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 97 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 97 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 97 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 97 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 97 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 97 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 97 | constexpr bool READ_32_BITS = | 203 | 97 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 97 | if (READ_32_BITS) { | 206 | 97 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 97 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 97 | return word & mask; | 209 | 97 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 97 | } |
_ZN5doris11UnpackValueILi14ELi22ELb0EEEmPKh Line | Count | Source | 176 | 97 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 97 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 97 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 97 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 97 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 97 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 97 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 97 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 97 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 97 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 97 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 97 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 97 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 97 | constexpr bool READ_32_BITS = | 203 | 97 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 97 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 97 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 97 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 97 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 97 | return word & mask; | 221 | 97 | } |
_ZN5doris11UnpackValueILi14ELi21ELb0EEEmPKh Line | Count | Source | 176 | 97 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 97 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 97 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 97 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 97 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 97 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 97 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 97 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 97 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 97 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 97 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 97 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 97 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 97 | constexpr bool READ_32_BITS = | 203 | 97 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 97 | if (READ_32_BITS) { | 206 | 97 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 97 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 97 | return word & mask; | 209 | 97 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 97 | } |
_ZN5doris11UnpackValueILi14ELi20ELb0EEEmPKh Line | Count | Source | 176 | 97 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 97 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 97 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 97 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 97 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 97 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 97 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 97 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 97 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 97 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 97 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 97 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 97 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 97 | constexpr bool READ_32_BITS = | 203 | 97 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 97 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 97 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 97 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 97 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 97 | return word & mask; | 221 | 97 | } |
_ZN5doris11UnpackValueILi14ELi19ELb0EEEmPKh Line | Count | Source | 176 | 97 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 97 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 97 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 97 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 97 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 97 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 97 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 97 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 97 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 97 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 97 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 97 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 97 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 97 | constexpr bool READ_32_BITS = | 203 | 97 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 97 | if (READ_32_BITS) { | 206 | 97 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 97 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 97 | return word & mask; | 209 | 97 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 97 | } |
_ZN5doris11UnpackValueILi14ELi18ELb0EEEmPKh Line | Count | Source | 176 | 97 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 97 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 97 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 97 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 97 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 97 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 97 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 97 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 97 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 97 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 97 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 97 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 97 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 97 | constexpr bool READ_32_BITS = | 203 | 97 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 97 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 97 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 97 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 97 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 97 | return word & mask; | 221 | 97 | } |
_ZN5doris11UnpackValueILi14ELi17ELb0EEEmPKh Line | Count | Source | 176 | 97 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 97 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 97 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 97 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 97 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 97 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 97 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 97 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 97 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 97 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 97 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 97 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 97 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 97 | constexpr bool READ_32_BITS = | 203 | 97 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 97 | if (READ_32_BITS) { | 206 | 97 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 97 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 97 | return word & mask; | 209 | 97 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 97 | } |
_ZN5doris11UnpackValueILi14ELi16ELb0EEEmPKh Line | Count | Source | 176 | 97 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 97 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 97 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 97 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 97 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 97 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 97 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 97 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 97 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 97 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 97 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 97 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 97 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 97 | constexpr bool READ_32_BITS = | 203 | 97 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 97 | if (READ_32_BITS) { | 206 | 97 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 97 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 97 | return word & mask; | 209 | 97 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 97 | } |
_ZN5doris11UnpackValueILi14ELi15ELb0EEEmPKh Line | Count | Source | 176 | 99 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 99 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 99 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 99 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 99 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 99 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 99 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 99 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 99 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 99 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 99 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 99 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 99 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 99 | constexpr bool READ_32_BITS = | 203 | 99 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 99 | if (READ_32_BITS) { | 206 | 99 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 99 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 99 | return word & mask; | 209 | 99 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 99 | } |
_ZN5doris11UnpackValueILi14ELi14ELb0EEEmPKh Line | Count | Source | 176 | 99 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 99 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 99 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 99 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 99 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 99 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 99 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 99 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 99 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 99 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 99 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 99 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 99 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 99 | constexpr bool READ_32_BITS = | 203 | 99 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 99 | if (READ_32_BITS) { | 206 | 99 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 99 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 99 | return word & mask; | 209 | 99 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 99 | } |
_ZN5doris11UnpackValueILi14ELi13ELb0EEEmPKh Line | Count | Source | 176 | 99 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 99 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 99 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 99 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 99 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 99 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 99 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 99 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 99 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 99 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 99 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 99 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 99 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 99 | constexpr bool READ_32_BITS = | 203 | 99 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 99 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 99 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 99 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 99 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 99 | return word & mask; | 221 | 99 | } |
_ZN5doris11UnpackValueILi14ELi12ELb0EEEmPKh Line | Count | Source | 176 | 99 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 99 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 99 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 99 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 99 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 99 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 99 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 99 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 99 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 99 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 99 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 99 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 99 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 99 | constexpr bool READ_32_BITS = | 203 | 99 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 99 | if (READ_32_BITS) { | 206 | 99 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 99 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 99 | return word & mask; | 209 | 99 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 99 | } |
_ZN5doris11UnpackValueILi14ELi11ELb0EEEmPKh Line | Count | Source | 176 | 99 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 99 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 99 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 99 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 99 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 99 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 99 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 99 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 99 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 99 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 99 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 99 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 99 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 99 | constexpr bool READ_32_BITS = | 203 | 99 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 99 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 99 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 99 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 99 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 99 | return word & mask; | 221 | 99 | } |
_ZN5doris11UnpackValueILi14ELi10ELb0EEEmPKh Line | Count | Source | 176 | 99 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 99 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 99 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 99 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 99 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 99 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 99 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 99 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 99 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 99 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 99 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 99 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 99 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 99 | constexpr bool READ_32_BITS = | 203 | 99 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 99 | if (READ_32_BITS) { | 206 | 99 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 99 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 99 | return word & mask; | 209 | 99 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 99 | } |
_ZN5doris11UnpackValueILi14ELi9ELb0EEEmPKh Line | Count | Source | 176 | 99 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 99 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 99 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 99 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 99 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 99 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 99 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 99 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 99 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 99 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 99 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 99 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 99 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 99 | constexpr bool READ_32_BITS = | 203 | 99 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 99 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 99 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 99 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 99 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 99 | return word & mask; | 221 | 99 | } |
_ZN5doris11UnpackValueILi14ELi8ELb0EEEmPKh Line | Count | Source | 176 | 99 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 99 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 99 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 99 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 99 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 99 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 99 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 99 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 99 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 99 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 99 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 99 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 99 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 99 | constexpr bool READ_32_BITS = | 203 | 99 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 99 | if (READ_32_BITS) { | 206 | 99 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 99 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 99 | return word & mask; | 209 | 99 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 99 | } |
_ZN5doris11UnpackValueILi14ELi7ELb0EEEmPKh Line | Count | Source | 176 | 99 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 99 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 99 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 99 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 99 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 99 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 99 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 99 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 99 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 99 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 99 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 99 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 99 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 99 | constexpr bool READ_32_BITS = | 203 | 99 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 99 | if (READ_32_BITS) { | 206 | 99 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 99 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 99 | return word & mask; | 209 | 99 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 99 | } |
_ZN5doris11UnpackValueILi14ELi6ELb0EEEmPKh Line | Count | Source | 176 | 99 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 99 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 99 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 99 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 99 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 99 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 99 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 99 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 99 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 99 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 99 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 99 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 99 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 99 | constexpr bool READ_32_BITS = | 203 | 99 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 99 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 99 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 99 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 99 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 99 | return word & mask; | 221 | 99 | } |
_ZN5doris11UnpackValueILi14ELi5ELb0EEEmPKh Line | Count | Source | 176 | 99 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 99 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 99 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 99 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 99 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 99 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 99 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 99 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 99 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 99 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 99 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 99 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 99 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 99 | constexpr bool READ_32_BITS = | 203 | 99 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 99 | if (READ_32_BITS) { | 206 | 99 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 99 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 99 | return word & mask; | 209 | 99 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 99 | } |
_ZN5doris11UnpackValueILi14ELi4ELb0EEEmPKh Line | Count | Source | 176 | 99 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 99 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 99 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 99 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 99 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 99 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 99 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 99 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 99 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 99 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 99 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 99 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 99 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 99 | constexpr bool READ_32_BITS = | 203 | 99 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 99 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 99 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 99 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 99 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 99 | return word & mask; | 221 | 99 | } |
_ZN5doris11UnpackValueILi14ELi3ELb0EEEmPKh Line | Count | Source | 176 | 99 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 99 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 99 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 99 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 99 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 99 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 99 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 99 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 99 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 99 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 99 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 99 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 99 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 99 | constexpr bool READ_32_BITS = | 203 | 99 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 99 | if (READ_32_BITS) { | 206 | 99 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 99 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 99 | return word & mask; | 209 | 99 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 99 | } |
_ZN5doris11UnpackValueILi14ELi2ELb0EEEmPKh Line | Count | Source | 176 | 99 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 99 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 99 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 99 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 99 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 99 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 99 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 99 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 99 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 99 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 99 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 99 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 99 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 99 | constexpr bool READ_32_BITS = | 203 | 99 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 99 | if (READ_32_BITS) { | 206 | 0 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 0 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 0 | return word & mask; | 209 | 0 | } | 210 | | | 211 | 99 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 99 | word >>= FIRST_BIT_OFFSET; | 213 | | | 214 | 99 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | | | 220 | 99 | return word & mask; | 221 | 99 | } |
_ZN5doris11UnpackValueILi14ELi1ELb0EEEmPKh Line | Count | Source | 176 | 99 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 99 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 99 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 99 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 99 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 99 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 99 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 99 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 99 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 99 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 99 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 99 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 99 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 99 | constexpr bool READ_32_BITS = | 203 | 99 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 99 | if (READ_32_BITS) { | 206 | 99 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 99 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 99 | return word & mask; | 209 | 99 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 99 | } |
_ZN5doris11UnpackValueILi14ELi0ELb0EEEmPKh Line | Count | Source | 176 | 99 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 99 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 99 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 99 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 99 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 99 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 99 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 99 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 99 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 99 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 99 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 99 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 99 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 99 | constexpr bool READ_32_BITS = | 203 | 99 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 99 | if (READ_32_BITS) { | 206 | 99 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 99 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 99 | return word & mask; | 209 | 99 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 99 | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi0ELb0EEEmPKh _ZN5doris11UnpackValueILi16ELi0ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi1ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi2ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi3ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi4ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi5ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi6ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi7ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi8ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi9ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi10ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi11ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi12ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi13ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi14ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi15ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi16ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi17ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi18ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi19ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi20ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi21ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi22ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi23ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi24ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi25ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi26ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi27ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi28ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi29ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi30ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
_ZN5doris11UnpackValueILi16ELi31ELb1EEEmPKh Line | Count | Source | 176 | 4.18k | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 4.18k | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 4.18k | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 4.18k | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 4.18k | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 4.18k | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 4.18k | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 4.18k | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 4.18k | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 4.18k | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 4.18k | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 4.18k | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 4.18k | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 4.18k | constexpr bool READ_32_BITS = | 203 | 4.18k | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 4.18k | if (READ_32_BITS) { | 206 | 4.18k | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 4.18k | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 4.18k | return word & mask; | 209 | 4.18k | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 4.18k | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi16ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi16ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi16ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi16ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi16ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi16ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi16ELi24ELb0EEEmPKh _ZN5doris11UnpackValueILi16ELi23ELb0EEEmPKh Line | Count | Source | 176 | 280 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 280 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 280 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 280 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 280 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 280 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 280 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 280 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 280 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 280 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 280 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 280 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 280 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 280 | constexpr bool READ_32_BITS = | 203 | 280 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 280 | if (READ_32_BITS) { | 206 | 280 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 280 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 280 | return word & mask; | 209 | 280 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 280 | } |
_ZN5doris11UnpackValueILi16ELi22ELb0EEEmPKh Line | Count | Source | 176 | 280 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 280 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 280 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 280 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 280 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 280 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 280 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 280 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 280 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 280 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 280 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 280 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 280 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 280 | constexpr bool READ_32_BITS = | 203 | 280 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 280 | if (READ_32_BITS) { | 206 | 280 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 280 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 280 | return word & mask; | 209 | 280 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 280 | } |
_ZN5doris11UnpackValueILi16ELi21ELb0EEEmPKh Line | Count | Source | 176 | 280 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 280 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 280 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 280 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 280 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 280 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 280 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 280 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 280 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 280 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 280 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 280 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 280 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 280 | constexpr bool READ_32_BITS = | 203 | 280 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 280 | if (READ_32_BITS) { | 206 | 280 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 280 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 280 | return word & mask; | 209 | 280 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 280 | } |
_ZN5doris11UnpackValueILi16ELi20ELb0EEEmPKh Line | Count | Source | 176 | 280 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 280 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 280 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 280 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 280 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 280 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 280 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 280 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 280 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 280 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 280 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 280 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 280 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 280 | constexpr bool READ_32_BITS = | 203 | 280 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 280 | if (READ_32_BITS) { | 206 | 280 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 280 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 280 | return word & mask; | 209 | 280 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 280 | } |
_ZN5doris11UnpackValueILi16ELi19ELb0EEEmPKh Line | Count | Source | 176 | 280 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 280 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 280 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 280 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 280 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 280 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 280 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 280 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 280 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 280 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 280 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 280 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 280 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 280 | constexpr bool READ_32_BITS = | 203 | 280 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 280 | if (READ_32_BITS) { | 206 | 280 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 280 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 280 | return word & mask; | 209 | 280 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 280 | } |
_ZN5doris11UnpackValueILi16ELi18ELb0EEEmPKh Line | Count | Source | 176 | 280 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 280 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 280 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 280 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 280 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 280 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 280 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 280 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 280 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 280 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 280 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 280 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 280 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 280 | constexpr bool READ_32_BITS = | 203 | 280 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 280 | if (READ_32_BITS) { | 206 | 280 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 280 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 280 | return word & mask; | 209 | 280 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 280 | } |
_ZN5doris11UnpackValueILi16ELi17ELb0EEEmPKh Line | Count | Source | 176 | 280 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 280 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 280 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 280 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 280 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 280 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 280 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 280 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 280 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 280 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 280 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 280 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 280 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 280 | constexpr bool READ_32_BITS = | 203 | 280 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 280 | if (READ_32_BITS) { | 206 | 280 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 280 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 280 | return word & mask; | 209 | 280 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 280 | } |
_ZN5doris11UnpackValueILi16ELi16ELb0EEEmPKh Line | Count | Source | 176 | 280 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 280 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 280 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 280 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 280 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 280 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 280 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 280 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 280 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 280 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 280 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 280 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 280 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 280 | constexpr bool READ_32_BITS = | 203 | 280 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 280 | if (READ_32_BITS) { | 206 | 280 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 280 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 280 | return word & mask; | 209 | 280 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 280 | } |
_ZN5doris11UnpackValueILi16ELi15ELb0EEEmPKh Line | Count | Source | 176 | 280 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 280 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 280 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 280 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 280 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 280 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 280 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 280 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 280 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 280 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 280 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 280 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 280 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 280 | constexpr bool READ_32_BITS = | 203 | 280 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 280 | if (READ_32_BITS) { | 206 | 280 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 280 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 280 | return word & mask; | 209 | 280 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 280 | } |
_ZN5doris11UnpackValueILi16ELi14ELb0EEEmPKh Line | Count | Source | 176 | 280 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 280 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 280 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 280 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 280 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 280 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 280 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 280 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 280 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 280 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 280 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 280 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 280 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 280 | constexpr bool READ_32_BITS = | 203 | 280 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 280 | if (READ_32_BITS) { | 206 | 280 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 280 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 280 | return word & mask; | 209 | 280 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 280 | } |
_ZN5doris11UnpackValueILi16ELi13ELb0EEEmPKh Line | Count | Source | 176 | 280 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 280 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 280 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 280 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 280 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 280 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 280 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 280 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 280 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 280 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 280 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 280 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 280 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 280 | constexpr bool READ_32_BITS = | 203 | 280 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 280 | if (READ_32_BITS) { | 206 | 280 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 280 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 280 | return word & mask; | 209 | 280 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 280 | } |
_ZN5doris11UnpackValueILi16ELi12ELb0EEEmPKh Line | Count | Source | 176 | 280 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 280 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 280 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 280 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 280 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 280 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 280 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 280 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 280 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 280 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 280 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 280 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 280 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 280 | constexpr bool READ_32_BITS = | 203 | 280 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 280 | if (READ_32_BITS) { | 206 | 280 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 280 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 280 | return word & mask; | 209 | 280 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 280 | } |
_ZN5doris11UnpackValueILi16ELi11ELb0EEEmPKh Line | Count | Source | 176 | 280 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 280 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 280 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 280 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 280 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 280 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 280 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 280 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 280 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 280 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 280 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 280 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 280 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 280 | constexpr bool READ_32_BITS = | 203 | 280 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 280 | if (READ_32_BITS) { | 206 | 280 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 280 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 280 | return word & mask; | 209 | 280 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 280 | } |
_ZN5doris11UnpackValueILi16ELi10ELb0EEEmPKh Line | Count | Source | 176 | 280 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 280 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 280 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 280 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 280 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 280 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 280 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 280 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 280 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 280 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 280 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 280 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 280 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 280 | constexpr bool READ_32_BITS = | 203 | 280 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 280 | if (READ_32_BITS) { | 206 | 280 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 280 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 280 | return word & mask; | 209 | 280 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 280 | } |
_ZN5doris11UnpackValueILi16ELi9ELb0EEEmPKh Line | Count | Source | 176 | 280 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 280 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 280 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 280 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 280 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 280 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 280 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 280 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 280 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 280 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 280 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 280 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 280 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 280 | constexpr bool READ_32_BITS = | 203 | 280 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 280 | if (READ_32_BITS) { | 206 | 280 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 280 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 280 | return word & mask; | 209 | 280 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 280 | } |
_ZN5doris11UnpackValueILi16ELi8ELb0EEEmPKh Line | Count | Source | 176 | 280 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 280 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 280 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 280 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 280 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 280 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 280 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 280 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 280 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 280 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 280 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 280 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 280 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 280 | constexpr bool READ_32_BITS = | 203 | 280 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 280 | if (READ_32_BITS) { | 206 | 280 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 280 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 280 | return word & mask; | 209 | 280 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 280 | } |
_ZN5doris11UnpackValueILi16ELi7ELb0EEEmPKh Line | Count | Source | 176 | 280 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 280 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 280 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 280 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 280 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 280 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 280 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 280 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 280 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 280 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 280 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 280 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 280 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 280 | constexpr bool READ_32_BITS = | 203 | 280 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 280 | if (READ_32_BITS) { | 206 | 280 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 280 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 280 | return word & mask; | 209 | 280 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 280 | } |
_ZN5doris11UnpackValueILi16ELi6ELb0EEEmPKh Line | Count | Source | 176 | 280 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 280 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 280 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 280 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 280 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 280 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 280 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 280 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 280 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 280 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 280 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 280 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 280 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 280 | constexpr bool READ_32_BITS = | 203 | 280 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 280 | if (READ_32_BITS) { | 206 | 280 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 280 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 280 | return word & mask; | 209 | 280 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 280 | } |
_ZN5doris11UnpackValueILi16ELi5ELb0EEEmPKh Line | Count | Source | 176 | 280 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 280 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 280 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 280 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 280 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 280 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 280 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 280 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 280 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 280 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 280 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 280 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 280 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 280 | constexpr bool READ_32_BITS = | 203 | 280 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 280 | if (READ_32_BITS) { | 206 | 280 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 280 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 280 | return word & mask; | 209 | 280 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 280 | } |
_ZN5doris11UnpackValueILi16ELi4ELb0EEEmPKh Line | Count | Source | 176 | 280 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 280 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 280 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 280 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 280 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 280 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 280 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 280 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 280 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 280 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 280 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 280 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 280 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 280 | constexpr bool READ_32_BITS = | 203 | 280 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 280 | if (READ_32_BITS) { | 206 | 280 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 280 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 280 | return word & mask; | 209 | 280 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 280 | } |
_ZN5doris11UnpackValueILi16ELi3ELb0EEEmPKh Line | Count | Source | 176 | 280 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 280 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 280 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 280 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 280 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 280 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 280 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 280 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 280 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 280 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 280 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 280 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 280 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 280 | constexpr bool READ_32_BITS = | 203 | 280 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 280 | if (READ_32_BITS) { | 206 | 280 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 280 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 280 | return word & mask; | 209 | 280 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 280 | } |
_ZN5doris11UnpackValueILi16ELi2ELb0EEEmPKh Line | Count | Source | 176 | 280 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 280 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 280 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 280 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 280 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 280 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 280 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 280 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 280 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 280 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 280 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 280 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 280 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 280 | constexpr bool READ_32_BITS = | 203 | 280 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 280 | if (READ_32_BITS) { | 206 | 280 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 280 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 280 | return word & mask; | 209 | 280 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 280 | } |
_ZN5doris11UnpackValueILi16ELi1ELb0EEEmPKh Line | Count | Source | 176 | 280 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 280 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 280 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 280 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 280 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 280 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 280 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 280 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 280 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 280 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 280 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 280 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 280 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 280 | constexpr bool READ_32_BITS = | 203 | 280 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 280 | if (READ_32_BITS) { | 206 | 280 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 280 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 280 | return word & mask; | 209 | 280 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 280 | } |
_ZN5doris11UnpackValueILi16ELi0ELb0EEEmPKh Line | Count | Source | 176 | 280 | uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) { | 177 | 280 | if (BIT_WIDTH == 0) return 0; | 178 | | | 179 | 280 | constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; | 180 | 280 | constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; | 181 | 280 | constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; | 182 | 280 | constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX); | 183 | 280 | constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; | 184 | 280 | static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); | 185 | | | 186 | 280 | constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; | 187 | 280 | constexpr uint64_t mask = GetMask(BIT_WIDTH); | 188 | 280 | const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); | 189 | | | 190 | | // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that | 191 | | // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is | 192 | | // enough space in the buffer from the current reading point. | 193 | | // We try to read 64 bits even when it is not necessary because the benchmarks show it | 194 | | // is faster. | 195 | 280 | constexpr bool CAN_SAFELY_READ_64_BITS = | 196 | 280 | FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; | 197 | | | 198 | | // We do not try to read 64 bits when the bit width is a power of two (unless it is | 199 | | // necessary) because performance benchmarks show that it is better this way. This seems | 200 | | // to be due to compiler optimisation issues, so we can revisit it when we update the | 201 | | // compiler version. | 202 | 280 | constexpr bool READ_32_BITS = | 203 | 280 | WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH)); | 204 | | | 205 | 280 | if (READ_32_BITS) { | 206 | 280 | uint32_t word = in[FIRST_WORD_IDX]; | 207 | 280 | word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; | 208 | 280 | return word & mask; | 209 | 280 | } | 210 | | | 211 | 0 | uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); | 212 | 0 | word >>= FIRST_BIT_OFFSET; | 213 | |
| 214 | 0 | if (WORDS_TO_READ > 2) { | 215 | 0 | constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; | 216 | 0 | uint64_t extra_word = in[FIRST_WORD_IDX + 2]; | 217 | 0 | word |= extra_word << USEFUL_BITS; | 218 | 0 | } | 219 | |
| 220 | 0 | return word & mask; | 221 | 280 | } |
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi0ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi0ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi1ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi2ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi3ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi4ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi5ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi6ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi7ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi8ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi9ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi10ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi11ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi12ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi13ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi14ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi15ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi16ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi17ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi18ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi19ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi20ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi21ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi22ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi23ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi24ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi25ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi26ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi27ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi28ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi29ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi30ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi31ELb1EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi30ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi29ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi28ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi27ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi26ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi25ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi24ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi23ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi22ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi21ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi20ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi19ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi18ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi17ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi16ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi15ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi14ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi13ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi12ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi11ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi10ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi9ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi8ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi7ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi6ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi5ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi4ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi3ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi2ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi1ELb0EEEmPKh Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi0ELb0EEEmPKh |